From: "Michael R. Crusoe" <crusoe@debian.org>
Date: Thu, 27 Feb 2025 10:06:00 +0100
Subject: Enable building on non-x86

Forwarded: https://github.com/bwa-mem2/bwa-mem2/pull/84
---
 src/FMI_search.cpp |  4 ++++
 src/FMI_search.h   |  3 ++-
 src/bandedSWA.cpp  |  7 -------
 src/bandedSWA.h    |  9 ++++++++-
 src/bwa.h          |  7 +++++++
 src/bwamem.cpp     |  5 ++++-
 src/fastmap.cpp    | 10 +++++++++-
 src/ksw.cpp        |  1 -
 src/ksw.h          |  3 ++-
 src/kswv.h         |  3 ++-
 src/main.cpp       |  5 +++++
 src/runsimd.cpp    |  2 +-
 src/utils.h        | 38 ++++++++++++++++++++++++++++++++++++++
 13 files changed, 82 insertions(+), 15 deletions(-)

diff --git a/src/FMI_search.cpp b/src/FMI_search.cpp
index 5f0ff48..4609fde 100644
--- a/src/FMI_search.cpp
+++ b/src/FMI_search.cpp
@@ -29,10 +29,14 @@ Authors: Sanchit Misra <sanchit.misra@intel.com>; Vasimuddin Md <vasimuddin.md@i
 
 #include <stdio.h>
 #include "sais.h"
+#if !(defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__))
+#include <immintrin.h>  // For __rdtsc
+#endif
 #include "FMI_search.h"
 #include "memcpy_bwamem.h"
 #include "profiling.h"
 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/FMI_search.h b/src/FMI_search.h
index 25c4d0d..92ac03f 100644
--- a/src/FMI_search.h
+++ b/src/FMI_search.h
@@ -34,7 +34,8 @@ Authors: Sanchit Misra <sanchit.misra@intel.com>; Vasimuddin Md <vasimuddin.md@i
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx2.h>
 #include <limits.h>
 #include <fstream>
 
diff --git a/src/bandedSWA.cpp b/src/bandedSWA.cpp
index dfd81bc..c99e072 100644
--- a/src/bandedSWA.cpp
+++ b/src/bandedSWA.cpp
@@ -4148,13 +4148,6 @@ void BandedPairWiseSW::smithWaterman128_16(uint16_t seq1SoA[],
 
 /********************************************************************************/
 /* SSE2 - 8 bit version */
-#ifndef __SSE4_1__
-static inline __m128i _mm_blendv_epi8 (__m128i x, __m128i y, __m128i mask)
-{
-    // Replace bit in x with bit in y when matching bit in mask is set:
-    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(mask, y));
-}
-#endif
 
 #define ZSCORE8(i4_128, y4_128)                                         \
     {                                                                   \
diff --git a/src/bandedSWA.h b/src/bandedSWA.h
index c81552b..1a9fd1f 100644
--- a/src/bandedSWA.h
+++ b/src/bandedSWA.h
@@ -36,10 +36,17 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
 #include <assert.h>
 #include "macro.h"
 
+#if !defined(__SSE__)
+#define _mm_malloc(size, align) aligned_alloc(align, size)
+#define _mm_free free
+#define _MM_HINT_NTA 0
+#endif
+
+#define SIMDE_ENABLE_NATIVE_ALIASES
 #if (__AVX512BW__ || __AVX2__)
 #include <immintrin.h>
 #else
-#include <smmintrin.h>  // for SSE4.1
+#include <simde/x86/sse4.1.h>  // for SSE4.1
 #define __mmask8 uint8_t
 #define __mmask16 uint16_t
 #endif
diff --git a/src/bwa.h b/src/bwa.h
index 877f00c..aa36083 100644
--- a/src/bwa.h
+++ b/src/bwa.h
@@ -37,6 +37,13 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
 #include "bwt.h"
 #include "macro.h"
 
+#if !defined(__SSE__)
+#define _mm_malloc(size, align) aligned_alloc(align, size)
+#define _mm_free free
+#define _MM_HINT_NTA 0
+#define _MM_HINT_T0 0
+#endif
+
 #define BWA_IDX_BWT 0x1
 #define BWA_IDX_BNS 0x2
 #define BWA_IDX_PAC 0x4
diff --git a/src/bwamem.cpp b/src/bwamem.cpp
index cfaa6ec..3f05377 100755
--- a/src/bwamem.cpp
+++ b/src/bwamem.cpp
@@ -28,6 +28,9 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
          Heng Li <hli@jimmy.harvard.edu>
 *****************************************************************************************/
 
+#if !(defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__))
+#include <immintrin.h>  // For __rdtsc
+#endif
 #include "bwamem.h"
 #include "FMI_search.h"
 #include "memcpy_bwamem.h"
@@ -2023,7 +2026,7 @@ inline void sortPairsLen(SeqPair *pairArray, int32_t count, SeqPair *tempArray,
 {
 
     int32_t i;
-#if ((!__AVX512BW__) & (__AVX2__ | __SSE2__))
+#if (!__AVX512BW__)
     for(i = 0; i <= MAX_SEQ_LEN16; i++) hist[i] = 0;
 #else
     __m512i zero512 = _mm512_setzero_si512();
diff --git a/src/fastmap.cpp b/src/fastmap.cpp
index 5cbb2dc..1318f90 100644
--- a/src/fastmap.cpp
+++ b/src/fastmap.cpp
@@ -36,9 +36,13 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
 #include <numa.h>
 #endif
 #include <sstream>
+#if !(defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__))
+#include <immintrin.h>  // For __rdtsc
+#endif
 #include "fastmap.h"
 #include "FMI_search.h"
 
+
 #if AFF && (__linux__)
 #include <sys/sysinfo.h>
 int affy[256];
@@ -52,7 +56,7 @@ void __cpuid(unsigned int i, unsigned int cpuid[4]) {
 #ifdef _WIN32
     __cpuid((int *) cpuid, (int)i);
 
-#else
+#elif defined(__x86_64__) || defined(__i386__)
     asm volatile
         ("cpuid" : "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
             : "0" (i), "2" (0));
@@ -62,6 +66,7 @@ void __cpuid(unsigned int i, unsigned int cpuid[4]) {
 
 int HTStatus()
 {
+#if defined(__x86_64__) || defined(__i386__)
     unsigned int cpuid[4];
     char platform_vendor[12];
     __cpuid(0, cpuid);
@@ -93,6 +98,9 @@ int HTStatus()
         fprintf(stderr, "CPUs support hyperThreading !!\n");
 
     return ht;
+#else
+    return 0;
+#endif
 }
 
 
diff --git a/src/ksw.cpp b/src/ksw.cpp
index ad9bc50..9369713 100644
--- a/src/ksw.cpp
+++ b/src/ksw.cpp
@@ -30,7 +30,6 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <assert.h>
-#include <emmintrin.h>
 #include "ksw.h"
 #include "macro.h"
 
diff --git a/src/ksw.h b/src/ksw.h
index 54bcef8..3179fc4 100644
--- a/src/ksw.h
+++ b/src/ksw.h
@@ -26,7 +26,8 @@
 #define __AC_KSW_H
 
 #include <stdint.h>
-#include <emmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse2.h>
 
 #define KSW_XBYTE  0x10000
 #define KSW_XSTOP  0x20000
diff --git a/src/kswv.h b/src/kswv.h
index 11da4d7..aed1d1a 100644
--- a/src/kswv.h
+++ b/src/kswv.h
@@ -39,7 +39,8 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
 #include "ksw.h"
 #include "bandedSWA.h"
 #else
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx2.h>
 #endif
 
 #ifdef __GNUC__
diff --git a/src/main.cpp b/src/main.cpp
index abee98d..0b4f07f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -29,6 +29,9 @@ Contacts: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@
 *****************************************************************************************/
 
 // ----------------------------------
+#if !(defined(__GNUC__) && __GNUC__ < 11 && !defined(__clang__))
+#include <immintrin.h>  // For __rdtsc
+#endif
 #include "main.h"
 
 #ifndef PACKAGE_VERSION
@@ -85,6 +88,8 @@ int main(int argc, char* argv[])
         fprintf(stderr, "Executing in SSE4.2 mode!!\n");
 #elif __SSE4_1__
         fprintf(stderr, "Executing in SSE4.1 mode!!\n");        
+#elif __SSE2__
+        fprintf(stderr, "Executing in SSE2!!\n");        
 #endif
         fprintf(stderr, "-----------------------------\n");
 
diff --git a/src/runsimd.cpp b/src/runsimd.cpp
index 03c0e91..e0798f2 100644
--- a/src/runsimd.cpp
+++ b/src/runsimd.cpp
@@ -61,7 +61,7 @@ void __cpuidex(int cpuid[4], int func_id, int subfunc_id)
 	__asm__ volatile ("cpuid"
 			: "=a" (cpuid[0]), "=b" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
 			: "0" (func_id), "2" (subfunc_id));
-#else // on 32bit, ebx can NOT be used as PIC code
+#elif defined(__i386__) // on 32bit, ebx can NOT be used as PIC code
 	__asm__ volatile ("xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1"
 			: "=a" (cpuid[0]), "=r" (cpuid[1]), "=c" (cpuid[2]), "=d" (cpuid[3])
 			: "0" (func_id), "2" (subfunc_id));
diff --git a/src/utils.h b/src/utils.h
index fbf8439..f07fee2 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -30,6 +30,7 @@
 
 #include <stdint.h>
 #include <stdio.h>
+#include <sys/select.h>
 #include <zlib.h>
 
 #ifdef __GNUC__
@@ -64,6 +65,43 @@ static inline unsigned long long __rdtsc(void)
     return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
 }
 #endif
+// From https://github.com/google/benchmark/blob/37177a84b7e8d33696ea1e1854513cb0de3b4dc3/src/cycleclock.h
+// Apache 2.0 license
+#elif defined(__aarch64__)
+  // AArch64 replacement for __rdtsc(): returns the raw value of the virtual
+  // counter register CNTVCT_EL0.  The ARMv8 system timer runs at a fixed
+  // frequency (typically 1-50MHz, readable from the CNTFRQ special register),
+  // not at the CPU's.  We assume the OS has set up the virtual timer properly.
+static inline unsigned long long __rdtsc(void)
+{
+  int64_t virtual_timer_value;
+  asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));  // read CNTVCT_EL0
+  return virtual_timer_value;  // implicitly converted to unsigned long long
+}
+#elif !(defined(__GNUC__) && __GNUC__ >= 11)
+static inline unsigned long long __rdtsc(void)  // fallback: ARM perf counter if usable, else wall-clock time
+{
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6)
+  // V6 is the earliest arch that has a standard cyclecount.  If user-mode
+  // access is off or the counter is not counting, fall through to gettimeofday().
+  uint32_t pmccntr;
+  uint32_t pmuseren;
+  uint32_t pmcntenset;
+  // Read the user-mode perf monitor counter access permissions.
+  asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+  if (pmuseren & 1) {  // Bit 0 set: user-mode code may read the perfmon counters.
+    asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+    if (pmcntenset & 0x80000000ul) {  // Bit 31 set: the counter is counting.
+      asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+      // The counter is set up to count every 64th cycle, so scale it back up.
+      return static_cast<int64_t>(pmccntr) * 64;  // Should optimize to << 6
+    }
+  }
+#endif
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);  // NOTE(review): POSIX declares this in <sys/time.h> — confirm it is included
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;  // wall-clock microseconds, not CPU cycles
+}
 #endif
 
 typedef struct {
