/*
 * Copyright 2020 The Emscripten Authors.  All rights reserved.
 * Emscripten is available under two separate licenses, the MIT license and the
 * University of Illinois/NCSA Open Source License.  Both these licenses can be
 * found in the LICENSE file.
 */
// This file runs a handful of NEON intrinsics against precomputed test values.

#include <cassert>
#include <stdio.h>
#include <string.h>
#include <arm_neon.h>

/*
 * Checks vaddq_s32 (lane-wise addition of two int32x4_t vectors) against
 * precomputed vectors.
 *
 * Each table entry holds the operands (a, b) and the expected result (r).
 * Several expected lanes are the two's-complement wrapped sum, so the table
 * also covers additions that overflow 32 bits.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vaddq_s32 () {
        static const struct {
                int32_t a[4];
                int32_t b[4];
                int32_t r[4];
        } test_vec[] = {
                        { { -446537815, -222544457,  1358712669,  1039645779 },
                          { -1526568976,  1435568682,  1254287845,  1972851916 },
                          { -1973106791,  1213024225, -1681966782, -1282469601 } },
                        { {  1801189811, -1805838537, -1176151706,  116907030 },
                          {  1856764227, -389858301, -231569882, -1922577958 },
                          { -637013258,  2099270458, -1407721588, -1805670928 } },
                        { { -84360509,  1083070169, -973507665, -1815351216 },
                          { -335448343, -640367181,  634062411, -1817038128 },
                          { -419808852,  442702988, -339445254,  662577952 } },
                        { { -829576203, -1341252863, -545912689, -495828488 },
                          {  1808692408, -2075876551,  2041122729,  906779457 },
                          {  979116205,  877837882,  1495210040,  410950969 } },
                        { {  117807366,  1152914357,  337849883,  670471535 },
                          {  1116914697,  298243687,  663384037, -782395445 },
                          {  1234722063,  1451158044,  1001233920, -111923910 } },
                        { { -438738384, -1859546762,  782585023, -346645534 },
                          { -953227168, -1529285441,  801858404,  704653818 },
                          { -1391965552,  906135093,  1584443427,  358008284 } },
                        { {  17881483,  697514346,  1750612102,  877899476 },
                          {  1442546070, -621095818,  822723895,  2052786670 },
                          {  1460427553,  76418528, -1721631299, -1364281150 } },
                        { {  1299934179,  729222821,  462671687,  324003453 },
                          { -563525016,  1438147358,  394707240, -1382948150 },
                          {  736409163, -2127597117,  857378927, -1058944697 } }
        };

        for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
                int32x4_t a = vld1q_s32(test_vec[i].a);
                int32x4_t b = vld1q_s32(test_vec[i].b);
                int32x4_t r = vaddq_s32(a, b);
                int32_t r_[4];
                vst1q_s32(r_, r);

                // Compare outside assert() so the check still runs, and is
                // reported through the return value, when NDEBUG is defined.
                const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
                assert(matches && "vaddq_s32 produced an unexpected result");
                if (!matches) {
                        return 1;
                }
        }

        return 0;
}

/*
 * Checks vsubq_s16 (lane-wise subtraction a - b of two int16x8_t vectors)
 * against precomputed vectors.
 *
 * Each table entry holds the operands (a, b) and the expected result (r).
 * Some expected lanes are the wrapped 16-bit difference, so the table also
 * covers subtractions that overflow.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vsubq_s16 () {
        static const struct {
                int16_t a[8];
                int16_t b[8];
                int16_t r[8];
        } test_vec[] = {
                        { {  31318,  4134, -6723,  21335, -25763,  17298, -9065,  1079 },
                          {  19197,  1228, -7522, -5302, -1743, -13071,  8924,  13041 },
                          {  12121,  2906,  799,  26637, -24020,  30369, -17989, -11962 } },
                        { {  6045,  23106, -25859,  23213,  16437, -13155, -10980,  6608 },
                          { -25569, -17123,  26495, -20312, -26016,  15740,  28348,  22895 },
                          {  31614, -25307,  13182, -22011, -23083, -28895,  26208, -16287 } },
                        { { -19835, -32077,  24908, -32291,  31393, -17075,  7503,  28374 },
                          { -3143,  14379, -11174, -17431,  25966,  11000,  26579,  22916 },
                          { -16692,  19080, -29454, -14860,  5427, -28075, -19076,  5458 } },
                        { {  14105,  26075, -18280,  14822,  13107, -32010, -13232,  2801 },
                          {  7359,  6722,  11248,  24277, -12911,  25737,  3380,  20157 },
                          {  6746,  19353, -29528, -9455,  26018,  7789, -16612, -17356 } },
                        { { -26300, -8781, -26031, -31722,  3533,  7687, -1831, -26328 },
                          {  27156,  1459, -30570,  10083, -5035, -30325,  18937,  16087 },
                          {  12080, -10240,  4539,  23731,  8568, -27524, -20768,  23121 } },
                        { { -29726,  13083,  12581, -3400, -16578,  6160,  14519, -13391 },
                          {  25763,  14800,  13548,  16736, -5344,  6858, -24012,  5720 },
                          {  10047, -1717, -967, -20136, -11234, -698, -27005, -19111 } },
                        { {  29485,  21066,  676, -7356,  21953,  30971, -21363,  12355 },
                          {  5136, -919, -14008,  26685,  1973, -5758, -9559, -10752 },
                          {  24349,  21985,  14684,  31495,  19980, -28807, -11804,  23107 } },
                        { {  19021, -3544,  27980,  3541, -12094,  20357, -14212, -29568 },
                          { -5668,  9352, -14925,  26765,  4044,  30289,  20970,  14156 },
                          {  24689, -12896, -22631, -23224, -16138, -9932,  30354,  21812 } }
        };

        for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
                int16x8_t a = vld1q_s16(test_vec[i].a);
                int16x8_t b = vld1q_s16(test_vec[i].b);
                int16x8_t r = vsubq_s16(a, b);

                int16_t r_[8];
                vst1q_s16(r_, r);

                // Compare outside assert() so the check still runs, and is
                // reported through the return value, when NDEBUG is defined.
                const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
                assert(matches && "vsubq_s16 produced an unexpected result");
                if (!matches) {
                        return 1;
                }
        }
        return 0;
}

/*
 * Checks vmulq_u32 (lane-wise multiplication of two uint32x4_t vectors)
 * against precomputed vectors.
 *
 * Each table entry holds the operands (a, b) and the expected 32-bit lane
 * results (r).  NOTE(review): the expected values are presumably the low 32
 * bits of each product -- confirm against the Arm intrinsics reference.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vmulq_u32 () {
        static const struct {
                uint32_t a[4];
                uint32_t b[4];
                uint32_t r[4];
        } test_vec[] = {
                        { { 838207622, 3405383108, 2377386269, 4159190503 },
                          { 2224025479, 3989376773, 3953058906, 1658203612 },
                          { 3851262122, 2552010964, 1750009906, 1512172932 } },
                        { { 4187212853, 4005793489, 2138863511, 880174253 },
                          { 1505297747, 1850114580, 1113187430, 1185165073 },
                          { 3442818607, 3505605204, 664347178, 86798973 } },
                        { { 3426695163, 1572537029, 668743546, 4015739547 },
                          { 1799885911, 4242116246, 2201891698, 1590288994 },
                          { 24355405, 3961120110, 2091516500, 93680982 } },
                        { { 3744073754, 2218583306, 3098220572, 3248949098 },
                          { 2989289243, 4021159549, 2608000313, 3925425103 },
                          { 1064771774, 31855074, 870468156, 3302911158 } },
                        { { 1305027651, 651298057, 2296282398, 2689238404 },
                          { 4048713332, 3051356284, 3176223469, 3500558989 },
                          { 2790346844, 2459594844, 3062591686, 2805707188 } },
                        { { 2015194990, 2476666741, 4045110381, 1972462849 },
                          { 1483138012, 3507373796, 630087576, 418723241 },
                          { 2297318536, 2355970612, 552883128, 2027545257 } },
                        { { 428872356, 1873554946, 2875246762, 153154093 },
                          { 3110242005, 1686859980, 2022250959, 4086333006 },
                          { 1432967796, 342591896, 433380214, 4018925494 } },
                        { { 2450268304, 4160862542, 2929943169, 699974484 },
                          { 367204937, 1501130378, 3604021895, 298410624 },
                          { 2423515408, 577464332, 2688480263, 2283524608 } },
        };

        for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
                uint32x4_t a = vld1q_u32(test_vec[i].a);
                uint32x4_t b = vld1q_u32(test_vec[i].b);
                uint32x4_t r = vmulq_u32(a, b);
                uint32_t r_[4];
                vst1q_u32(r_, r);

                // Compare outside assert() so the check still runs, and is
                // reported through the return value, when NDEBUG is defined.
                const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
                assert(matches && "vmulq_u32 produced an unexpected result");
                if (!matches) {
                        return 1;
                }
        }
        return 0;
}

/*
 * Checks vbslq_s8 (bitwise select on int8x16_t vectors) against precomputed
 * vectors.
 *
 * Each table entry holds the unsigned selection mask (a), the two signed
 * inputs (b, c) and the expected result (r).  The expected bytes take the
 * bits of b where the mask bit is set and the bits of c where it is clear,
 * e.g. mask 136, b 60, c 56 gives 56 in the first entry.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vbslq_s8 () {
        static const struct {
                uint8_t a[16];
                int8_t b[16];
                int8_t c[16];
                int8_t r[16];
        } test_vec[] = {
                        { { 136, 230, 97, 0, 20, 50, 138, 231,
                            UINT8_MAX, 18, 190, 185, 181, 25, 53, 6 },
                          {  60, -30,  93, -89, -3,  121,  83, -123,
                             102, -115, -97, -39, -104,  82, -24,  32 },
                          {  56,  73,  32,  76,  123, -85,  51,  122,
                             -67, -15,  51,  114,  11,  104,  120,  71 },
                          {  56, -21,  65,  76,      INT8_MAX, -71,  51, -99,
                             102, -31, -97, -37, -102,  112,  104,  65 } },
                        { { 74, 214, 238, 71, 79, 65, 204, 181,
                            207, 107, 142, 103, 189, 118, 136, 245 },
                          { -65, -88,  65,  58,  83,  117, -76,  17,
                            102, -25, -125,  113,  79, -4, -72, -103 },
                          { -46, -90, -32,  33, -24, -84, -42, -73,
                            23,  100,  30, -44, -37, -90, -55, -102 },
                          { -102, -96,  64,  34, -29, -19, -106,  19,
                            86,  103, -110, -15,  79, -12, -55, -101 } },
                        { { 79, 11, 213, 162, 128, 137, 179, 230,
                            113, 55, 88, 192, 51, 16, 90, 5 },
                          { -73,  58,  38, -97, -25, -4,  86, -2,
                            96,  116, -45,  59,  27, -100, -42,  106 },
                          { -89, -85,  12,  39,  52, -64,  14, -91,
                            -9,  102,  102,  42,  118, -64,  47,  45 },
                          { -89, -86,  12, -121, -76, -56,  30, -25,
                            -26,  116,  118,  42,  87, -48,  119,  40 } },
                        { { 250, 85, 204, 225, 81, 34, 224, 177,
                            151, 179, 237, 178, 79, 195, 28, 247 },
                          {  110,  40,  30, -94, -24,  44,  72, -33,
                             -110, -82,  9,  9,  110,  56,  54,  104 },
                          { -115,  3,  74, -34,  37,  42, -112, -68,
                            -35,  125,  110,  44,  64, -118,  35, -82 },
                          {  111,  2,  14, -66,  100,  40,  80, -99,
                             -38, -18,  11,  12,  78,  8,  55,  104 } },
                        { { 179, 66, 80, 155, 110, 152, 123, 1,
                            70, 132, 10, 180, 189, 64, 29, 74 },
                          {  67,  103,  41,  105, -111, -71,  37,  110,
                             54, -108, -102,  118,  30, -66,  36, -47 },
                          {  0,  116,  109,  110,  13, -24,  111,  83,
                             108,  121,  8,  41, -70,  37,  116, -3 },
                          {  3,  118,  45,  109,  1, -8,  37,  82,
                             46, -3,  10,  61,  30,  37,  100, -11 } },
                        { { 140, 157, 102, 29, 86, 140, 139, 140,
                            32, 37, 2, 62, 227, 38, 16, 227 },
                          { -102,  125,  82, -89,  101, -63, -5, -47,
                            59,  3, -5, -11,  40,  111, -14, -76 },
                          {  12,  89, -47,  98, -27,  92, -18,  5,
                             -127, -16,  67,  101,  22,  83,  72, -80 },
                          { -120,  93, -45,  103, -27, -48, -17, -127,
                            -95, -47,  67,  117,  52,  119,  88, -80 } },
                        { { 208, 154, 88, 53, 92, 83, 7, 151,
                            86, 2, 140, 126, 113, 126, 50, 125 },
                          { -41,  3, -33, -68,  95, -51, -63, -32,
                            -67,  5,  69, -45,  88, -114, -125,  41 },
                          {  40, -37,  94, -124,  46,  101,  27, -124,
                             103, -89,  2, -40,  38,  52,  85, -3 },
                          { -8,  67,  94, -76,  126,  101,  25,      INT8_MIN,
                            53, -91,  6, -46,  86,  14,  71, -87 } },
                        { { 55, 52, 186, 150, 1, 123, 119, 190,
                            128, 188, 145, 217, 74, 21, 2, 115 },
                          { -16,  96, -9,  31, -58,  19, -93,  45,
                            -70, -90,  6, -32, -38,  91, -34,  18 },
                          { -112, -104, -88, -111,  19,  31,  80, -108,
                            -36, -31,  109,  38, -10,  111, -103, -25 },
                          { -80, -88, -78,  23,  18,  23,  35,  44,
                            -36, -27,  108, -26, -2,  123, -101, -106 } },
        };

        for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
                uint8x16_t a = vld1q_u8(test_vec[i].a);
                int8x16_t b = vld1q_s8(test_vec[i].b);
                int8x16_t c = vld1q_s8(test_vec[i].c);
                int8x16_t r = vbslq_s8(a, b, c);

                int8_t r_[16];
                vst1q_s8(r_, r);

                // Compare outside assert() so the check still runs, and is
                // reported through the return value, when NDEBUG is defined.
                const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
                assert(matches && "vbslq_s8 produced an unexpected result");
                if (!matches) {
                        return 1;
                }
        }

        return 0;
}

/*
 * Checks vshl_s64 (shift of an int64x1_t by a signed per-lane count) against
 * precomputed vectors.
 *
 * Each table entry holds the value (a), the shift count (b) and the expected
 * result (r).  Negative counts appear with expected values that match an
 * arithmetic right shift (e.g. a count of -51 yields -1835 for the first
 * entry).  The table also contains counts of very large magnitude;
 * NOTE(review): presumably only the low byte of the count is significant --
 * confirm against the Arm intrinsics reference.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vshl_s64 () {
  static const struct {
    int64_t a[1];
    int64_t b[1];
    int64_t r[1];
  } test_vec[] = {
    { { -4131760704340726406 },
      { -51 },
      { -1835 } },
    { {  4795347804945835666 },
      { -8617733424538507080 },
      {  0 } },
    { { -2987477723010756817 },
      { -49 },
      { -5307 } },
    { { -834069088546172233 },
      {  62 },
      { -4611686018427387904 } },
    { { -3249651920531950297 },
      {  2418075451758470935 },
      {  7338760155284111360 } },
    { {  5243402359250859729 },
      { -47 },
      {  37256 } },
    { { -5953442574285360237 },
      {  33 },
      { -1003907829020491776 } },
    { {  7218011237552599699 },
      { -36 },
      {  105035887 } },
  };

  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
    int64x1_t a = vld1_s64(test_vec[i].a);
    int64x1_t b = vld1_s64(test_vec[i].b);
    int64x1_t r = vshl_s64(a, b);
    int64_t r_[1];
    vst1_s64(r_, r);

    // Compare outside assert() so the check still runs, and is reported
    // through the return value, when NDEBUG is defined.
    const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
    assert(matches && "vshl_s64 produced an unexpected result");
    if (!matches) {
      return 1;
    }
  }

  return 0;
}

/*
 * Checks vdotq_s32 (signed 8-bit dot product accumulated into int32x4_t)
 * against precomputed vectors.
 *
 * Each table entry holds the accumulator (a), the two int8x16_t inputs (b, c)
 * and the expected result (r).  Each expected 32-bit lane is the matching
 * accumulator lane plus the sum of the four products of the corresponding
 * b and c bytes, e.g. lane 0 of the first entry adds 6581 to -1315274814.
 *
 * Returns 0 when every vector matches and 1 on the first mismatch.  A
 * mismatch additionally trips assert() in builds that keep assertions on.
 */
static int
test_simde_vdotq_s32 () {
  static const struct {
    int32_t a[4];
    int8_t b[16];
    int8_t c[16];
    int32_t r[4];
  } test_vec[] = {
    { { -INT32_C(  1315274814),  INT32_C(  1813013239), -INT32_C(   878005830),  INT32_C(   996395424) },
      { -INT8_C(  46), -INT8_C(  66),  INT8_C(  35), -INT8_C( 112), -INT8_C(  26), -INT8_C(  23), -INT8_C(  99), -INT8_C(  11),
        -INT8_C(  89),  INT8_C(  77), -INT8_C(  31), -INT8_C(  32), -INT8_C(  24),  INT8_C(  93), -INT8_C( 104), -INT8_C(  86) },
      { -INT8_C(  36),  INT8_C(  50),  INT8_C(  91), -INT8_C(  45), -INT8_C( 107),  INT8_C( 107),  INT8_C(  63),  INT8_C(  79),
         INT8_C(  28), -INT8_C(  22),  INT8_C(  26), -INT8_C(  68), -INT8_C(  77),  INT8_C( 125), -INT8_C(   8), -INT8_C( 123) },
      { -INT32_C(  1315268233),  INT32_C(  1813006454), -INT32_C(   878008646),  INT32_C(   996420307) } },
    { {  INT32_C(   555031355), -INT32_C(  1424575996), -INT32_C(   410257409),  INT32_C(   831595604) },
      {  INT8_C(  86), -INT8_C(  19),  INT8_C(   4), -INT8_C(  21),  INT8_C(  88),  INT8_C(  68),  INT8_C(  58),  INT8_C( 117),
         INT8_C(  46),  INT8_C(  84),  INT8_C(  49), -INT8_C(  31), -INT8_C(  47),  INT8_C(  41),  INT8_C( 102),  INT8_C(  13) },
      {  INT8_C(  68),  INT8_C( 123),  INT8_C(  46),  INT8_C(  72),  INT8_C(  45),  INT8_C(  69), -INT8_C(  13),  INT8_C(  45),
         INT8_C(  60),      INT8_MAX,  INT8_C(  20), -INT8_C( 111), -INT8_C(  93), -INT8_C(  90), -INT8_C(  62), -INT8_C(   7) },
      {  INT32_C(   555033538), -INT32_C(  1424562833), -INT32_C(   410239560),  INT32_C(   831589870) } },
    { { -INT32_C(   337262957),  INT32_C(   945823498),  INT32_C(  1159303796),  INT32_C(     5406651) },
      { -INT8_C(   6), -INT8_C( 127),  INT8_C(  72),  INT8_C(  40), -INT8_C(  58),  INT8_C(  60),  INT8_C(  85),  INT8_C(   2),
        -INT8_C(  69),  INT8_C( 105), -INT8_C( 109),  INT8_C(  94),  INT8_C(  15),  INT8_C(  85),  INT8_C(  87), -INT8_C(  94) },
      {  INT8_C(  28),  INT8_C(  60), -INT8_C( 114),  INT8_C(  38),  INT8_C(  92), -INT8_C(  18),  INT8_C(  95), -INT8_C(  48),
             INT8_MIN,  INT8_C( 120),  INT8_C(  21),  INT8_C(  60), -INT8_C(   8),  INT8_C( 104),  INT8_C(  60), -INT8_C(  14) },
      { -INT32_C(   337277433),  INT32_C(   945825061),  INT32_C(  1159328579),  INT32_C(     5421907) } },
    { { -INT32_C(  1357216535),  INT32_C(  2075226048), -INT32_C(   388413991), -INT32_C(  1232391782) },
      {  INT8_C( 109),  INT8_C(  25), -INT8_C(  35), -INT8_C(  55),  INT8_C(   7),  INT8_C(  60), -INT8_C( 103), -INT8_C( 120),
        -INT8_C(  76), -INT8_C(  81), -INT8_C(  60), -INT8_C(  84),  INT8_C(  23),  INT8_C(   0), -INT8_C(  97),  INT8_C(   0) },
      { -INT8_C( 124), -INT8_C(  71), -INT8_C(  81),  INT8_C(  69),  INT8_C(  41),  INT8_C(  96), -INT8_C(  64),  INT8_C(   2),
        -INT8_C(  91), -INT8_C( 102), -INT8_C(  22),  INT8_C(  64), -INT8_C(  53),  INT8_C( 117), -INT8_C(  10),  INT8_C(  56) },
      { -INT32_C(  1357232786),  INT32_C(  2075238447), -INT32_C(   388402869), -INT32_C(  1232392031) } },
    { { -INT32_C(  1778199666), -INT32_C(  1004627185),  INT32_C(  1634787914),  INT32_C(  1717637090) },
      { -INT8_C(  55),  INT8_C(  16), -INT8_C(  85), -INT8_C(  14),  INT8_C( 113),  INT8_C( 108), -INT8_C(  12),  INT8_C(  22),
         INT8_C(   6), -INT8_C(  34),  INT8_C(  86), -INT8_C(  47),  INT8_C(  84),  INT8_C(  77),  INT8_C(   9), -INT8_C(  30) },
      {  INT8_C(  32),  INT8_C(  11),  INT8_C( 120),  INT8_C(  48), -INT8_C(  89), -INT8_C( 106), -INT8_C(  12), -INT8_C(  15),
         INT8_C( 120),  INT8_C( 100),  INT8_C(  83),  INT8_C(  90),  INT8_C( 116), -INT8_C(  76), -INT8_C(  63),  INT8_C(  61) },
      { -INT32_C(  1778212122), -INT32_C(  1004648876),  INT32_C(  1634788142),  INT32_C(  1717638585) } },
    { {  INT32_C(   909077701), -INT32_C(   565435432),  INT32_C(  1437573889),  INT32_C(   272153072) },
      { -INT8_C(  60), -INT8_C(  80),  INT8_C(  64),  INT8_C( 107),  INT8_C(  71),  INT8_C(  52),  INT8_C(  93), -INT8_C(  65),
        -INT8_C( 103), -INT8_C(  80),  INT8_C(  26),  INT8_C(  13),  INT8_C( 100), -INT8_C(  37),  INT8_C(  74),  INT8_C(  41) },
      {  INT8_C(  71),  INT8_C( 121),  INT8_C(  95),  INT8_C(  32), -INT8_C( 100), -INT8_C(  84), -INT8_C(   2), -INT8_C(  99),
         INT8_C(  79), -INT8_C(  82), -INT8_C(  13),  INT8_C(  63),  INT8_C( 103),  INT8_C(  43),  INT8_C(  79),  INT8_C(  43) },
      {  INT32_C(   909073265), -INT32_C(   565440651),  INT32_C(  1437572793),  INT32_C(   272169390) } },
    { {  INT32_C(   580358363),  INT32_C(  1575154884),  INT32_C(   141229220),  INT32_C(   506639575) },
      {  INT8_C(  45), -INT8_C( 111),  INT8_C(  62), -INT8_C(  55),  INT8_C(  61),  INT8_C(  61),  INT8_C( 103), -INT8_C( 116),
        -INT8_C(  21),  INT8_C(  90), -INT8_C(  53),  INT8_C(  82), -INT8_C( 123),  INT8_C(  27),  INT8_C( 125),  INT8_C(  96) },
      { -INT8_C(  85),  INT8_C(  20), -INT8_C( 125),  INT8_C( 111),  INT8_C(   8),  INT8_C( 101), -INT8_C(  51), -INT8_C(  84),
         INT8_C(  97),  INT8_C(  55), -INT8_C(  75),  INT8_C(  56), -INT8_C(  20), -INT8_C(  25),  INT8_C(  86),  INT8_C(  25) },
      {  INT32_C(   580338463),  INT32_C(  1575166024),  INT32_C(   141240700),  INT32_C(   506654510) } },
    { { -INT32_C(  1226599048), -INT32_C(  1119728942),  INT32_C(   688852644), -INT32_C(   729183191) },
      { -INT8_C(  95),  INT8_C(  12),  INT8_C(  67), -INT8_C(  87),  INT8_C( 113),  INT8_C(  16),  INT8_C(  86), -INT8_C(  46),
         INT8_C(  72),  INT8_C(  11),  INT8_C(  10),  INT8_C(  52), -INT8_C(  14),  INT8_C(  97),  INT8_C(  77),  INT8_C( 106) },
      { -INT8_C(  10),  INT8_C(  48),  INT8_C(  32), -INT8_C(  56),  INT8_C( 122),  INT8_C(  99), -INT8_C( 123),  INT8_C(  30),
         INT8_C( 113), -INT8_C( 108),  INT8_C(  71), -INT8_C( 102),  INT8_C(  32), -INT8_C(  47),  INT8_C( 110), -INT8_C(  63) },
      { -INT32_C(  1226590506), -INT32_C(  1119725530),  INT32_C(   688854998), -INT32_C(   729186406) } },
  };

  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
    int32x4_t a = vld1q_s32(test_vec[i].a);
    int8x16_t b = vld1q_s8(test_vec[i].b);
    int8x16_t c = vld1q_s8(test_vec[i].c);
    int32x4_t r = vdotq_s32(a, b, c);
    int32_t r_[4];
    vst1q_s32(r_, r);

    // Compare outside assert() so the check still runs, and is reported
    // through the return value, when NDEBUG is defined.
    const int matches = memcmp(r_, test_vec[i].r, sizeof(r_)) == 0;
    assert(matches && "vdotq_s32 produced an unexpected result");
    if (!matches) {
      return 1;
    }
  }

  return 0;
}

/*
 * Runs every NEON test in this file and reports the overall outcome.
 *
 * Each test returns an int status, with 0 meaning success.  "Success!" is
 * printed and 0 returned only when every status is 0; otherwise the number
 * of failing tests is written to stderr and 1 is returned.
 */
int main() {
        printf("Testing NEON Wasm SIMD\n");

        // Count the statuses instead of discarding them, so a failing test
        // can never be followed by "Success!".
        int failures = 0;
        failures += (test_simde_vdotq_s32() != 0);
        failures += (test_simde_vaddq_s32() != 0);
        failures += (test_simde_vsubq_s16() != 0);
        failures += (test_simde_vmulq_u32() != 0);
        failures += (test_simde_vbslq_s8() != 0);
        failures += (test_simde_vshl_s64() != 0);

        if (failures != 0) {
                fprintf(stderr, "%d NEON test(s) failed\n", failures);
                return 1;
        }

        printf("Success!\n");
        return 0;
}
