diff --git a/pyfastani/_fasta.pyx b/pyfastani/_fasta.pyx
index c4381ea79399721d2cc2f00f0a84c9e4da48d499..394ba0846c068f01ed6d5ea579ee4bd2a0dd2dc8 100644
--- a/pyfastani/_fasta.pyx
+++ b/pyfastani/_fasta.pyx
@@ -11,7 +11,7 @@ from libc.errno cimport errno
 from libc.string cimport strlen, memcpy, memset
 
 from _unicode cimport PyUnicode_1BYTE_KIND, PyUnicode_FromKindAndData
-from _simd cimport copy_upper
+from _sequtils cimport copy_upper
 
 cdef extern from "<ctype.h>" nogil:
     cdef int toupper(int c)
diff --git a/pyfastani/_fastani.pyx b/pyfastani/_fastani.pyx
index 2afb2633b2a5526cb62fe19e5063e438f8e2c41f..663ed817b6e370980ebd0f18f37d10af7024aa04 100644
--- a/pyfastani/_fastani.pyx
+++ b/pyfastani/_fastani.pyx
@@ -46,8 +46,9 @@ from fastani.map.base_types cimport (
 # HACK: we need kseq_t* as a template argument, which is not supported by
 #       Cython at the moment, so we just `typedef kseq_t* kseq_ptr_t` in
 #       an external C++ header to make Cython happy
-from _utils cimport kseq_ptr_t, toupper, complement, distance
+from _utils cimport kseq_ptr_t, toupper, distance
 from _unicode cimport *
+from _sequtils cimport copy_upper, reverse_complement
 
 
 # --- Python imports ---------------------------------------------------------
@@ -88,10 +89,17 @@ cdef ssize_t _read_nucl(
     else:
         length = 0
 
-    for j in range(length):
-        nuc = toupper(<int> PyUnicode_READ(kind, data, i + j))
-        fwd[_MAX_KMER_SIZE + j] = nuc
-        bwd[_MAX_KMER_SIZE - j - 1] = complement(nuc)
+    # if UCS-1, bytes are next to each other, so we can use the SIMD
+    # implementations to copy into uppercase
+    if kind == PyUnicode_1BYTE_KIND:
+        copy_upper(&fwd[_MAX_KMER_SIZE], &(<char*> data)[i], length)
+    else:
+        for j in range(length):
+            fwd[_MAX_KMER_SIZE + j] = toupper(<int> PyUnicode_READ(kind, data, i + j))
+
+    # reverse complement in backward buffer
+    reverse_complement(&bwd[_MAX_KMER_SIZE - length], &fwd[_MAX_KMER_SIZE], length)
+
 
     return length
 
diff --git a/pyfastani/_sequtils/__init__.pxd b/pyfastani/_sequtils/__init__.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..2ef1acee34702888c3b0f4f98117d044615bce87
--- /dev/null
+++ b/pyfastani/_sequtils/__init__.pxd
@@ -0,0 +1,4 @@
+cdef extern from "sequtils.h" nogil:
+    char complement(char base)
+    void copy_upper(char*, const char*, size_t)
+    void reverse_complement(char*, const char*, size_t)
diff --git a/pyfastani/_sequtils/complement.h b/pyfastani/_sequtils/complement.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a3ccb84ac93001e49ef4a41f5bba95bbae997b6
--- /dev/null
+++ b/pyfastani/_sequtils/complement.h
@@ -0,0 +1,28 @@
+#ifndef __SEQUTILS_COMPLEMENT_H
+#define __SEQUTILS_COMPLEMENT_H
+
+// efficient nucleotide complement with a lookup table
+static const char COMPLEMENT_LOOKUP[128] = {
+    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
+    '\x08', '\t',   '\n',   '\x0', '\x0c', '\r',   '\x0e', '\x0f',
+    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
+    '\x18', '\x19', '\x1a', '\x1', '\x1c', '\x1d', '\x1e', '\x1f',
+    ' ',    '!',    '"',    '#',    '$',    '%',    '&',    '\'',
+    '(',    ')',    '*',    '+',    ',',    '-',    '.',    '/',
+    '0',    '1',    '2',    '3',    '4',    '5',    '6',    '7',
+    '8',    '9',    ':',    ';',    '<',    '=',    '>',    '?',
+    '@',    'T',    'V',    'G',    'H',    'E',    'F',    'C',
+    'D',    'I',    'J',    'M',    'L',    'K',    'N',    'O',
+    'P',    'Q',    'Y',    'S',    'A',    'U',    'B',    'W',
+    'X',    'R',    'Z',    '[',    '\\',   ']',    '^',    '_',
+    '`',    't',    'v',    'g',    'h',    'e',    'f',    'c',
+    'd',    'i',    'j',    'm',    'l',    'k',    'n',    'o',
+    'p',    'q',    'y',    's',    'a',    'u',    'b',    'w',
+    'x',    'r',    'z',    '{',    '|',    '}',    '~',    '\x7f'
+};
+
+static char complement(char base) {
+    return COMPLEMENT_LOOKUP[(size_t) (base & 0x7F)];
+}
+
+#endif
diff --git a/pyfastani/_sequtils/sequtils.h b/pyfastani/_sequtils/sequtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bc1edfdceaa61692b4cb06a94b3a70f46f1dfa8
--- /dev/null
+++ b/pyfastani/_sequtils/sequtils.h
@@ -0,0 +1,29 @@
+#ifndef __SEQUTILS_H
+#define __SEQUTILS_H
+
+#include "complement.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void default_copy_upper(char*, const char*, size_t);
+#ifdef SSE2_BUILD_SUPPORTED
+void sse2_copy_upper(char*, const char*, size_t);
+#endif
+#ifdef NEON_BUILD_SUPPORTED
+void neon_copy_upper(char* dst, const char* src, size_t len);
+#endif
+void copy_upper(char*, const char*, size_t);
+
+void default_reverse_complement(char*, const char*, size_t);
+#ifdef SSSE3_BUILD_SUPPORTED
+extern void ssse3_reverse_complement(char* dst, const char* src, size_t len);
+#endif
+void reverse_complement(char*, const char*, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pyfastani/_simd.c b/pyfastani/_simd.c
deleted file mode 100644
index 5114c65343d7ed4aade13164229149aea6c3d806..0000000000000000000000000000000000000000
--- a/pyfastani/_simd.c
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <ctype.h>
-#include <stddef.h>
-#ifdef __SSE2__
-#include <x86intrin.h>
-#endif
-#ifdef __ARM_NEON__
-#include <arm_neon.h>
-#endif
-
-#include "_simd.h"
-
-void default_copy_upper(char* dst, const char* src, size_t len) {
-    while (len-- > 0) {
-        *dst = toupper(*src);
-        src++;
-        dst++;
-    }
-}
-
-#ifdef __SSE2__
-void sse2_copy_upper(char* dst, const char* src, size_t len) {
-
-    const __m128i ascii_a = _mm_set1_epi8('a' - 1);
-    const __m128i ascii_z = _mm_set1_epi8('z');
-    const __m128i offset  = _mm_set1_epi8('a' - 'A');
-
-    while (len >= sizeof(__m128i)) {
-        __m128i inp = _mm_loadu_si128((__m128i*) src);
-        __m128i greater_than_a = _mm_cmpgt_epi8(inp, ascii_a);
-        __m128i less_equal_z = _mm_cmpgt_epi8(ascii_z, inp);
-        __m128i mask = _mm_and_si128(greater_than_a, less_equal_z);
-        __m128i diff = _mm_and_si128(mask, offset);
-        __m128i added = _mm_sub_epi8(inp, diff);
-        _mm_storeu_si128((__m128i *) dst, added);
-        len -= sizeof(__m128i);
-        src += sizeof(__m128i);
-        dst += sizeof(__m128i);
-    }
-
-    default_copy_upper(dst, src, len);
-}
-#endif
-#ifdef __ARM_NEON__
-void neon_copy_upper(char* dst, const char* src, size_t len) {
-
-    const int8x16_t ascii_a = vdupq_n_s8('a' - 1);
-    const int8x16_t ascii_z = vdupq_n_s8('z');
-    const int8x16_t offset  = vdupq_n_s8('a' - 'A');
-
-    while (len >= sizeof(int8x16_t)) {
-        int8x16_t inp = vld1q_u8((int8_t*) src);
-        int8x16_t greater_than_a = vcgtq_s8(inp, ascii_a);
-        int8x16_t less_equal_z = vcgtq_s8(ascii_z, inp);
-        int8x16_t mask = vandq_s8(greater_than_a, less_equal_z);
-        int8x16_t diff = vandq_s8(mask, offset);
-        int8x16_t added = vsubq_s8(inp, diff);
-        vst1q_s8((int8_t*) dst, added);
-        len -= sizeof(int8x16_t);
-        src += sizeof(int8x16_t);
-        dst += sizeof(int8x16_t);
-    }
-
-    default_copy_upper(dst, src, len);
-}
-#endif
-
-void (*resolve_copy_upper (void))(char*, const char*, size_t)
-{
-  // ifunc resolvers fire before constructors, explicitly call the init
-  // function.
-#ifdef __SSE2__
-  __builtin_cpu_init ();
-  if (__builtin_cpu_supports ("sse2"))
-    return sse2_copy_upper; // fast copying plus upper.
-  else
-#endif
-#ifdef __ARM_NEON__
-__builtin_cpu_init ();
-if (__builtin_cpu_supports ("neon"))
-  return neon_copy_upper; // fast copying plus upper.
-else
-#endif
-    return default_copy_upper;
-}
-
-void copy_upper(char*, const char*, size_t)
-     __attribute__ ((ifunc ("resolve_copy_upper")));
diff --git a/pyfastani/_simd.h b/pyfastani/_simd.h
deleted file mode 100644
index 1eeac380693c0b7dc81ad2daf87ab4c9d27641d6..0000000000000000000000000000000000000000
--- a/pyfastani/_simd.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __SIMD_H
-#define __SIMD_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void default_copy_upper(char*, const char*, size_t);
-#ifdef __SSE2__
-void sse_copy_upper(char*, const char*, size_t);
-#endif
-#ifdef __ARM_NEON__
-void neon_copy_upper(char* dst, const char* src, size_t len);
-#endif
-void copy_upper(char*, const char*, size_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/pyfastani/_simd.pxd b/pyfastani/_simd.pxd
deleted file mode 100644
index f545df43d38f279ff71b2e1b2802dfa82ef3580b..0000000000000000000000000000000000000000
--- a/pyfastani/_simd.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef extern from "_simd.h" nogil:
-    void copy_upper(char*, const char*, size_t)
diff --git a/pyfastani/_utils.cpp b/pyfastani/_utils.cpp
index af2065688a01fce3675f30d8b223d2b82390c13a..8170735b0b1bdbcdec9230636455af6b8b09ed9b 100644
--- a/pyfastani/_utils.cpp
+++ b/pyfastani/_utils.cpp
@@ -1,14 +1,6 @@
-#include "omp.h"
+#include <zlib.h>
 #include "_utils.hpp"
 
-int omp_get_thread_num(void) {
-    return 1; // Make the logger shut up.
-}
-
-int omp_get_num_threads(void) {
-    return 1;
-}
-
 ZEXTERN gzFile ZEXPORT gzdopen(int fd, const char* mode) {
     return NULL;
 }
diff --git a/pyfastani/_utils.hpp b/pyfastani/_utils.hpp
index 0c113608cfcff73cb74d795d46519b185ff4ac6f..6b9b317414e14e4a4af1f9a7fc70359c044e889b 100644
--- a/pyfastani/_utils.hpp
+++ b/pyfastani/_utils.hpp
@@ -14,34 +14,8 @@
 #include "map/include/winSketch.hpp"
 
 extern "C" {
-
     // compatibility layer for Cython
     typedef kseq_t* kseq_ptr_t;
-
-    // efficient nucleotide complement with a lookup table
-    static const char COMPLEMENT_LOOKUP[128] = {
-        '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
-        '\x08', '\t',   '\n',   '\x0', '\x0c', '\r',   '\x0e', '\x0f',
-        '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
-        '\x18', '\x19', '\x1a', '\x1', '\x1c', '\x1d', '\x1e', '\x1f',
-        ' ',    '!',    '"',    '#',    '$',    '%',    '&',    '\'',
-        '(',    ')',    '*',    '+',    ',',    '-',    '.',    '/',
-        '0',    '1',    '2',    '3',    '4',    '5',    '6',    '7',
-        '8',    '9',    ':',    ';',    '<',    '=',    '>',    '?',
-        '@',    'T',    'V',    'G',    'H',    'E',    'F',    'C',
-        'D',    'I',    'J',    'M',    'L',    'K',    'N',    'O',
-        'P',    'Q',    'Y',    'S',    'A',    'U',    'B',    'W',
-        'X',    'R',    'Z',    '[',    '\\',   ']',    '^',    '_',
-        '`',    't',    'v',    'g',    'h',    'e',    'f',    'c',
-        'd',    'i',    'j',    'm',    'l',    'k',    'n',    'o',
-        'p',    'q',    'y',    's',    'a',    'u',    'b',    'w',
-        'x',    'r',    'z',    '{',    '|',    '}',    '~',    '\x7f'
-    };
-
-    inline char complement(char base) {
-        return COMPLEMENT_LOOKUP[(size_t) (base & 0x7F)];
-    }
-
 }
 
-#endif // ifdef __UTILS_HPP
+#endif
diff --git a/pyfastani/_utils.pxd b/pyfastani/_utils.pxd
index a630ee114bb9c7ac76cf15dc98e69cd796a617a8..11500d8402a1dd1d7d70624d8deef37ff319edd0 100644
--- a/pyfastani/_utils.pxd
+++ b/pyfastani/_utils.pxd
@@ -24,20 +24,18 @@ cdef extern from "<iterator>" namespace "std" nogil:
     cdef ssize_t distance[I](I first, I last);
 
 
-cdef extern from "<zlib.h>" nogil:
-    cdef struct gzFile_s:
-        pass
-
-    ctypedef gzFile_s* gzFile
-
-    gzFile gzopen(int fd, const char* mode)
-    gzFile gzopen64(const char* path, const char* mode)
-    int gzread(gzFile file, void* buf, unsigned int len)
-    int gzclose(gzFile file)
+# cdef extern from "<zlib.h>" nogil:
+#     cdef struct gzFile_s:
+#         pass
+#
+#     ctypedef gzFile_s* gzFile
+#
+#     gzFile gzopen(int fd, const char* mode)
+#     gzFile gzopen64(const char* path, const char* mode)
+#     int gzread(gzFile file, void* buf, unsigned int len)
+#     int gzclose(gzFile file)
 
 
 cdef extern from "_utils.hpp" nogil:
 
     ctypedef kseq_t* kseq_ptr_t
-
-    int complement(int)
diff --git a/pyfastani/omp.cpp b/pyfastani/omp.cpp
index e670dd613d92b68c246869ddb9e59b50005d8f61..f6c68896fc6f483c2718c0b403f3ce734f4c1c0a 100644
--- a/pyfastani/omp.cpp
+++ b/pyfastani/omp.cpp
@@ -2,5 +2,10 @@
 
 // not needed anywhere except in `cgi::correctRefGenomeIds` so we can just
 // patch these functions to disable logging
-extern int omp_get_thread_num(void);
-extern int omp_get_num_threads(void);
+int omp_get_thread_num(void) {
+    return 1; // Make the logger shut up.
+}
+
+int omp_get_num_threads(void) {
+    return 1;
+}