diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs
index 90e3f0f8..cda8bc5f 100644
--- a/benchmarks/haskell/Benchmarks.hs
+++ b/benchmarks/haskell/Benchmarks.hs
@@ -43,6 +43,7 @@ main = do
         , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
         , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian")
         , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
+        , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
         , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
         , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
         , FileRead.benchmark (tf "russian.txt")
diff --git a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
index 3a54b6b8..addb1a94 100644
--- a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
+++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
@@ -17,6 +17,7 @@
 module Benchmarks.DecodeUtf8
     ( initEnv
     , benchmark
+    , benchmarkASCII
     ) where
 
 import Foreign.C.Types
@@ -67,6 +68,17 @@ benchmark kind ~(bs, lbs) =
         , bench "LazyStringUtf8Length" $ nf (length . U8.toString) lbs
         ]
 
+benchmarkASCII :: Env -> Benchmark
+benchmarkASCII ~(bs, lbs) =
+    bgroup "DecodeASCII"
+        [ C.bench "strict decodeUtf8" $ nf T.decodeUtf8 bs
+        , C.bench "strict decodeLatin1" $ nf T.decodeLatin1 bs
+        , C.bench "strict decodeASCII" $ nf T.decodeASCII bs
+        , C.bench "lazy decodeUtf8" $ nf TL.decodeUtf8 lbs
+        , C.bench "lazy decodeLatin1" $ nf TL.decodeLatin1 lbs
+        , C.bench "lazy decodeASCII" $ nf TL.decodeASCII lbs
+        ]
+
 iconv :: B.ByteString -> IO CInt
 iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
                         time_iconv (ptr `plusPtr` off) (fromIntegral len)
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 46357011..3b7e6181 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -11,6 +11,11 @@
 #include <stdio.h>
 #include "text_cbits.h"
 
+#if defined(__x86_64__)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
                      size_t n)
 {
@@ -82,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
   while (p != srcend && (uintptr_t)p & 0x3)
     *dest++ = *p++;
 
+#if defined(__x86_64__)
+  /* All the intrinsics used here are from SSE2,
+   * so every x86_64 CPU supports them.
+   */
+  const __m128i zeros = _mm_set1_epi32(0);
+  while (p < srcend - 3) {
+    /* Load 4 bytes of ASCII data */
+    const __m128i ascii = _mm_cvtsi32_si128(*((const uint32_t *)p));
+    /* Interleave with zeros */
+    const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
+    /* Store the resulting 8 bytes into destination */
+    _mm_storel_epi64((__m128i *)dest, utf16);
+
+    dest += 4;
+    p += 4;
+  }
+#else
   /* iterate over 32-bit aligned loads */
   while (p < srcend - 3) {
     const uint32_t w = *((const uint32_t *)p);
@@ -93,6 +115,7 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
 
     p += 4;
   }
+#endif
 #endif
 
   /* handle unaligned suffix */