src/corelib/tools/qsimd.cpp
changeset 37 758a864f9613
parent 33 3e2da88830cd
--- a/src/corelib/tools/qsimd.cpp	Fri Sep 17 08:34:18 2010 +0300
+++ b/src/corelib/tools/qsimd.cpp	Mon Oct 04 01:19:32 2010 +0300
@@ -41,20 +41,42 @@
 
 #include "qsimd_p.h"
 #include <QByteArray>
+#include <stdio.h>
 
 #if defined(Q_OS_WINCE)
 #include <windows.h>
 #endif
 
+#if defined(Q_OS_WIN64) && !defined(Q_CC_GNU)
+#include <intrin.h>
+#endif
+
+#if defined(Q_OS_LINUX) && defined(__arm__)
+#include "private/qcore_unix_p.h"
+
+// the kernel header definitions for HWCAP_*
+// (the ones we need/may need anyway)
+
+// copied from <asm/hwcap.h> (ARM)
+#define HWCAP_IWMMXT    512
+#define HWCAP_CRUNCH    1024
+#define HWCAP_THUMBEE   2048
+#define HWCAP_NEON      4096
+#define HWCAP_VFPv3     8192
+#define HWCAP_VFPv3D16  16384
+
+// copied from <linux/auxvec.h>
+#define AT_HWCAP  16    /* arch dependent hints at CPU capabilities */
+
+#endif
+
 QT_BEGIN_NAMESPACE
 
-uint qDetectCPUFeatures()
+#if defined (Q_OS_WINCE)
+static inline uint detectProcessorFeatures()
 {
-    static uint features = 0xffffffff;
-    if (features != 0xffffffff)
-        return features;
+    uint features = 0;
 
-#if defined (Q_OS_WINCE)
 #if defined (ARM)
     if (IsProcessorFeaturePresent(PF_ARM_INTEL_WMMX)) {
         features = IWMMXT;
@@ -74,75 +96,98 @@
 #endif
     features = 0;
     return features;
-#elif defined(QT_HAVE_IWMMXT)
+}
+
+#elif defined(__arm__) || defined(__arm) || defined(QT_HAVE_IWMMXT) || defined(QT_HAVE_NEON)
+static inline uint detectProcessorFeatures() // ARM variant: returns a mask built from the IWMMXT and NEON bits
+{
+    uint features = 0;
+
+#if defined(Q_OS_LINUX)
+    int auxv = ::qt_safe_open("/proc/self/auxv", O_RDONLY); // source of the AT_HWCAP capability word
+    if (auxv != -1) {
+        unsigned long vector[64]; // read buffer, scanned below as (type, value) pairs
+        int nread;
+        while (features == 0) { // keep reading until a feature bit is set or the file ends
+            nread = ::qt_safe_read(auxv, (char *)vector, sizeof vector);
+            if (nread <= 0) {
+                // EOF or error
+                break;
+            }
+
+            int max = nread / (sizeof vector[0]); // number of words actually read
+            for (int i = 0; i < max; i += 2) // even index = entry type, odd index = its value
+                if (vector[i] == AT_HWCAP) {
+                    if (vector[i+1] & HWCAP_IWMMXT)
+                        features |= IWMMXT;
+                    if (vector[i+1] & HWCAP_NEON)
+                        features |= NEON;
+                    break; // leaves only the for loop; the while condition decides whether to read on
+                }
+        }
+
+        ::qt_safe_close(auxv);
+        return features; // once the file was opened, the build-time defaults below are not used
+    }
+    // fall back to the build-time configuration if /proc/self/auxv could not be opened
+#endif
+
+#if defined(QT_HAVE_IWMMXT)
     // runtime detection only available when running as a previlegied process
-    static const bool doIWMMXT = !qgetenv("QT_NO_IWMMXT").toInt();
-    features = doIWMMXT ? IWMMXT : 0;
-    return features;
+    features = IWMMXT;
 #elif defined(QT_HAVE_NEON)
-    static const bool doNEON = !qgetenv("QT_NO_NEON").toInt();
-    features = doNEON ? NEON : 0;
+    features = NEON;
+#endif
+
     return features;
-#else
-    features = 0;
-#if defined(__x86_64__) || defined(Q_OS_WIN64)
-    features = MMX|SSE|SSE2|CMOV;
-#elif defined(__ia64__)
-    features = MMX|SSE|SSE2;
+}
+
 #elif defined(__i386__) || defined(_M_IX86)
+static inline uint detectProcessorFeatures()
+{
+    uint features = 0;
+
     unsigned int extended_result = 0;
+    unsigned int feature_result = 0;
     uint result = 0;
     /* see p. 118 of amd64 instruction set manual Vol3 */
 #if defined(Q_CC_GNU)
-    asm ("push %%ebx\n"
-         "pushf\n"
-         "pop %%eax\n"
-         "mov %%eax, %%ebx\n"
-         "xor $0x00200000, %%eax\n"
-         "push %%eax\n"
+    long cpuid_supported, tmp1;
+    asm ("pushf\n"
+         "pop %0\n"
+         "mov %0, %1\n"
+         "xor $0x00200000, %0\n"
+         "push %0\n"
          "popf\n"
          "pushf\n"
-         "pop %%eax\n"
-         "xor %%edx, %%edx\n"
-         "xor %%ebx, %%eax\n"
-         "jz 1f\n"
-
-         "mov $0x00000001, %%eax\n"
-         "cpuid\n"
-         "1:\n"
-         "pop %%ebx\n"
-         "mov %%edx, %0\n"
-        : "=r" (result)
-        :
-        : "%eax", "%ecx", "%edx"
-        );
+         "pop %0\n"
+         "xor %1, %0\n" // %eax is now 0 if CPUID is not supported
+         : "=a" (cpuid_supported), "=r" (tmp1)
+         );
+    if (cpuid_supported) {
+        asm ("xchg %%ebx, %2\n"
+             "cpuid\n"
+             "xchg %%ebx, %2\n"
+            : "=c" (feature_result), "=d" (result), "=&r" (tmp1)
+            : "a" (1));
 
-    asm ("push %%ebx\n"
-         "pushf\n"
-         "pop %%eax\n"
-         "mov %%eax, %%ebx\n"
-         "xor $0x00200000, %%eax\n"
-         "push %%eax\n"
-         "popf\n"
-         "pushf\n"
-         "pop %%eax\n"
-         "xor %%edx, %%edx\n"
-         "xor %%ebx, %%eax\n"
-         "jz 2f\n"
+        asm ("xchg %%ebx, %1\n"
+             "cpuid\n"
+             "cmp $0x80000000, %%eax\n"
+             "jnbe 1f\n"
+             "xor %0, %0\n"
+             "jmp 2f\n"
+             "1:\n"
+             "mov $0x80000001, %%eax\n"
+             "cpuid\n"
+             "2:\n"
+             "xchg %%ebx, %1\n"
+            : "=d" (extended_result), "=&r" (tmp1)
+            : "a" (0x80000000)
+            : "%ecx"
+            );
+    }
 
-         "mov $0x80000000, %%eax\n"
-         "cpuid\n"
-         "cmp $0x80000000, %%eax\n"
-         "jbe 2f\n"
-         "mov $0x80000001, %%eax\n"
-         "cpuid\n"
-         "2:\n"
-         "pop %%ebx\n"
-         "mov %%edx, %0\n"
-        : "=r" (extended_result)
-        :
-        : "%eax", "%ecx", "%edx"
-        );
 #elif defined (Q_OS_WIN)
     _asm {
         push eax
@@ -164,6 +209,7 @@
         mov eax, 1
         cpuid
         mov result, edx
+        mov feature_result, ecx
     skip:
         pop edx
         pop ecx
@@ -203,6 +249,7 @@
     }
 #endif
 
+
     // result now contains the standard feature bits
     if (result & (1u << 15))
         features |= CMOV;
@@ -218,44 +265,141 @@
         features |= SSE;
     if (result & (1u << 26))
         features |= SSE2;
-    if (extended_result & (1u))
+    if (feature_result & (1u))
         features |= SSE3;
-    if (extended_result & (1u << 9))
+    if (feature_result & (1u << 9))
         features |= SSSE3;
-    if (extended_result & (1u << 19))
+    if (feature_result & (1u << 19))
         features |= SSE4_1;
-    if (extended_result & (1u << 20))
+    if (feature_result & (1u << 20))
         features |= SSE4_2;
-    if (extended_result & (1u << 28))
+    if (feature_result & (1u << 28))
+        features |= AVX;
+
+    return features;
+}
+
+#elif defined(__x86_64) || defined(Q_OS_WIN64)
+static inline uint detectProcessorFeatures() // x86-64 variant: fixed baseline plus the ECX bits of CPUID leaf 1
+{
+    uint features = MMX|SSE|SSE2|CMOV; // taken as present on this architecture without probing
+    uint feature_result = 0; // ECX of CPUID leaf 1; stays 0 if neither branch below is compiled
+
+#if defined(Q_CC_GNU)
+    uint leaf = 1; // in: CPUID leaf to query; out: scratch, because CPUID overwrites EAX
+    asm ("cpuid"
+        : "=c" (feature_result), "+a" (leaf)
+        :
+        : "%ebx", "%edx");
+#elif defined (Q_OS_WIN64)
+    {
+       int info[4]; // filled by __cpuid; element 2 is read as the ECX word
+       __cpuid(info, 1);
+       feature_result = info[2];
+    }
+#endif
+
+    if (feature_result & (1u))
+        features |= SSE3;
+    if (feature_result & (1u << 9))
+        features |= SSSE3;
+    if (feature_result & (1u << 19))
+        features |= SSE4_1;
+    if (feature_result & (1u << 20))
+        features |= SSE4_2;
+    if (feature_result & (1u << 28)) // NOTE(review): only the CPU flag is tested; OS support for AVX state is not checked here
         features |= AVX;
 
-#endif // i386
-
-#if defined(QT_HAVE_MMX)
-    if (qgetenv("QT_NO_MMX").toInt())
-        features ^= MMX;
-#endif
-    if (qgetenv("QT_NO_MMXEXT").toInt())
-        features ^= MMXEXT;
+    return features;
+}
 
-#if defined(QT_HAVE_3DNOW)
-    if (qgetenv("QT_NO_3DNOW").toInt())
-        features ^= MMX3DNOW;
-#endif
-    if (qgetenv("QT_NO_3DNOWEXT").toInt())
-        features ^= MMX3DNOWEXT;
+#elif defined(__ia64__)
+static inline uint detectProcessorFeatures() // IA-64 variant: no probing, fixed feature set
+{
+    return uint(MMX | SSE | SSE2); // same constant mask on every call
+}
 
-#if defined(QT_HAVE_SSE)
-    if (qgetenv("QT_NO_SSE").toInt())
-        features ^= SSE;
-#endif
-#if defined(QT_HAVE_SSE2)
-    if (qgetenv("QT_NO_SSE2").toInt())
-        features ^= SSE2;
+#else
+static inline uint detectProcessorFeatures() // fallback for architectures with no detection code
+{
+    return uint(0); // empty mask: no optional features reported
+}
 #endif
 
+/*
+ * Use kdesdk/scripts/generate_string_table.pl to update the table below.
+ * Here's the data (don't forget the ONE leading space):
+ mmx
+ mmxext
+ mmx3dnow
+ mmx3dnowext
+ sse
+ sse2
+ cmov
+ iwmmxt
+ neon
+ sse3
+ ssse3
+ sse4.1
+ sse4.2
+ avx
+  */
+
+// begin generated
+static const char features_string[] = // NUL-separated feature names, each with one leading space
+    " mmx\0"
+    " mmxext\0"
+    " mmx3dnow\0"
+    " mmx3dnowext\0"
+    " sse\0"
+    " sse2\0"
+    " cmov\0"
+    " iwmmxt\0"
+    " neon\0"
+    " sse3\0"
+    " ssse3\0"
+    " sse4.1\0"
+    " sse4.2\0"
+    " avx\0"
+    "\0";
+
+static const int features_indices[] = { // start offset of each name in features_string; -1 ends the list
+       0,    5,   13,   23,   36,   41,   47,   53,
+      61,   67,   73,   80,   88,   96,   -1
+};
+// end generated
+
+const int features_count = sizeof(features_indices) / sizeof(features_indices[0]) - 1; // entries, not counting the -1 terminator
+
+uint qDetectCPUFeatures() // cached feature mask, minus the names listed in QT_NO_CPU_FEATURE
+{
+    static QBasicAtomicInt features = Q_BASIC_ATOMIC_INITIALIZER(-1); // -1 means "not computed yet"
+    if (features != -1)
+        return features;
+
+    uint f = detectProcessorFeatures();
+    QByteArray disable = qgetenv("QT_NO_CPU_FEATURE"); // space-separated feature names to mask out
+    if (!disable.isEmpty()) {
+        disable.prepend(' ').append(' '); // delimit both ends so that only whole names match
+        for (int i = 0; i < features_count; ++i) {
+            const QByteArray name = QByteArray(features_string + features_indices[i]).append(' ');
+            if (disable.contains(name)) // " sse " is not found inside " sse2 ", unlike a bare " sse"
+                f &= ~(1u << i); // table entry i names feature bit i
+        }
+    }
+    features = f;
     return features;
-#endif
+}
+
+void qDumpCPUFeatures() // writes the names of all detected features on one line of stdout
+{
+    const uint detected = qDetectCPUFeatures();
+    fputs("Processor features: ", stdout);
+    for (int bit = 0; bit < features_count; ++bit) {
+        if ((detected >> bit) & 1u) // each table entry carries its own leading space
+            fputs(features_string + features_indices[bit], stdout);
+    }
+    putchar('\n');
 }
 
 QT_END_NAMESPACE