diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs
index 90e3f0f8..cda8bc5f 100644
--- a/benchmarks/haskell/Benchmarks.hs
+++ b/benchmarks/haskell/Benchmarks.hs
@@ -43,6 +43,7 @@ main = do
         , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
         , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark  "russian")
         , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
+        , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
         , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
         , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
         , FileRead.benchmark (tf "russian.txt")
diff --git a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
index 3a54b6b8..addb1a94 100644
--- a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
+++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
@@ -17,6 +17,7 @@
 module Benchmarks.DecodeUtf8
     ( initEnv
     , benchmark
+    , benchmarkASCII
     ) where
 
 import Foreign.C.Types
@@ -67,6 +68,17 @@ benchmark kind ~(bs, lbs) =
         , bench "LazyStringUtf8Length" $ nf (length . U8.toString) lbs
         ]
 
+-- | Benchmark group @DecodeASCII@: decodeUtf8, decodeLatin1 and decodeASCII from @T@ and @TL@, forced with 'nf'.
+benchmarkASCII :: Env -> Benchmark
+benchmarkASCII ~(strictInput, lazyInput) = bgroup "DecodeASCII"
+    [ C.bench "strict decodeUtf8" (nf T.decodeUtf8 strictInput)
+    , C.bench "strict decodeLatin1" (nf T.decodeLatin1 strictInput)
+    , C.bench "strict decodeASCII" (nf T.decodeASCII strictInput)
+    , C.bench "lazy decodeUtf8" (nf TL.decodeUtf8 lazyInput)
+    , C.bench "lazy decodeLatin1" (nf TL.decodeLatin1 lazyInput)
+    , C.bench "lazy decodeASCII" (nf TL.decodeASCII lazyInput)
+    ]
+
 iconv :: B.ByteString -> IO CInt
 iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
                         time_iconv (ptr `plusPtr` off) (fromIntegral len)
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 46357011..3b7e6181 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -11,6 +11,11 @@
 #include <stdio.h>
 #include "text_cbits.h"
 
+#if defined(__x86_64__)
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#endif
+
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
 		     size_t n)
 {
@@ -82,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
   while (p != srcend && (uintptr_t)p & 0x3)
     *dest++ = *p++;
 
+#if defined(__x86_64__)
+  /* All the intrinsics used here are from SSE2,
+   * so every x86_64 CPU supports them.
+   */
+  const __m128i zeros = _mm_set1_epi32(0);
+  while (p < srcend - 3) {
+    /* Load 4 bytes of Latin-1 data */
+    const __m128i ascii = _mm_cvtsi32_si128(*((const uint32_t *)p));
+    /* Interleave with zeros */
+    const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
+    /* Store the resulting 8 bytes into destination */
+    _mm_storel_epi64((__m128i *)dest, utf16);
+
+    dest += 4;
+    p += 4;
+  }
+#else
   /* iterate over 32-bit aligned loads */
   while (p < srcend - 3) {
     const uint32_t w = *((const uint32_t *)p);
@@ -93,6 +115,7 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
 
     p += 4;
   }
+#endif
 #endif
 
   /* handle unaligned suffix */