VPSHUFB applies the wrong mask. Or how to better load unsigned 8bit data as 16bit? [duplicate]

Solution found, edit3;

This is a simple reproducible example that I am unable to figure out.
I don't really understand this behavior yet, but I bet it is well documented.
Still, I am baffled that I managed to get something so simple wrong.

#include <stdint.h>

/*
 * Reproducer: tries to widen 32 unsigned bytes to 32 unsigned 16-bit values
 * with two VPSHUFB shuffles driven by the byte masks `ushorta`/`ushortb`.
 * Two `int3` breakpoints let a debugger inspect the registers before and
 * after the shuffles. The program does not produce the expected result.
 *
 * NOTE(review): each __asm__ statement is separate and uses hard-coded ymm
 * registers; presumably nothing guarantees the compiler leaves those
 * registers untouched between statements -- confirm against the compiler's
 * extended-asm documentation.
 */
int main(int argc, char** argv) {
    // Goal: Convert 8bit data to 16bit
    // Shuffle mask for the first half: byte pairs {source index, 0xFF} for
    // source indexes 0x00..0x0F. The 0xFF entry (high bit set) is intended
    // to produce a zero upper byte in each little-endian 16-bit result.
    static const uint8_t ushorta[32] = {
        0x00, 0xFF, 0x01, 0xFF,
        0x02, 0xFF, 0x03, 0xFF,
        0x04, 0xFF, 0x05, 0xFF,
        0x06, 0xFF, 0x07, 0xFF,
        0x08, 0xFF, 0x09, 0xFF,
        0x0A, 0xFF, 0x0B, 0xFF,
        0x0C, 0xFF, 0x0D, 0xFF,
        0x0E, 0xFF, 0x0F, 0xFF
    };
    // Shuffle mask for the second half: same layout, source indexes
    // 0x10..0x1F.
    static const uint8_t ushortb[32] = {
        0x10, 0xFF, 0x11, 0xFF,
        0x12, 0xFF, 0x13, 0xFF,
        0x14, 0xFF, 0x15, 0xFF,
        0x16, 0xFF, 0x17, 0xFF,
        0x18, 0xFF, 0x19, 0xFF,
        0x1A, 0xFF, 0x1B, 0xFF,
        0x1C, 0xFF, 0x1D, 0xFF,
        0x1E, 0xFF, 0x1F, 0xFF
    };

    // Input data: the bytes 0x00..0x1F in order, so every output element
    // should equal its own position.
    static const uint8_t in[32] = {
        0x00, 0x01, 0x02, 0x03,
        0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0A, 0x0B,
        0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1A, 0x1B,
        0x1C, 0x1D, 0x1E, 0x1F 
    };
    
    // Unaligned 256-bit loads: masks into ymm14/ymm15, input into ymm9.
    // The last operand of each statement is the clobber list.
    __asm__("VMOVDQU (%0), %%ymm14"::"r" (ushorta) :"%ymm14");
    __asm__("VMOVDQU (%0), %%ymm15"::"r" (ushortb) :"%ymm15");
    __asm__("VMOVDQU (%0), %%ymm9"::"r" (in) :"%ymm9");
    // Breakpoint trap: inspect ymm9/ymm14/ymm15 before the shuffles.
    __asm__("int3");
// Operand order of vpshufb:
// Intel:
// dst, src, mask
// AT&T:
// mask, src, dst
    // NOTE(review): the 256-bit form of VPSHUFB presumably shuffles each
    // 128-bit lane independently and uses only the low 4 bits of every
    // index, so 0x10..0x1F cannot reach across lanes. That would explain
    // why both shuffles below give the same result -- confirm against the
    // instruction reference.
    __asm__("vpshufb %%ymm15, %%ymm9, %%ymm12":::"%ymm12");
    __asm__("vpshufb %%ymm14, %%ymm9, %%ymm9":::"%ymm9"); // Still broken when commented out
    // Breakpoint trap: inspect ymm9/ymm12 after the shuffles.
    __asm__("int3");
    // Expected result: First half in ymm9, 2nd half in ymm12, 8bit data converted to 16bit data
    // Actual Result: Mirrored Malformed transform in ymm9 & ymm12;
}

I will move to intrinsics (immintrin.h) next time.

For context, data loaded into ymm14,15,9 is aligned and loads properly for me;

Edit:
ushorta and ushortb are now laid out for little-endian 16-bit values;
but it still doesn't work, as is evident in the GDB/memory dumps.
The third "ymm9" on that line is the inline-assembly clobber list, which marks a register that gets trashed (overwritten).

Edit 2:
GDB register dumps:

Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000555555555146 in main ()
(gdb) p/x $ymm9
$1 = v32_int8 = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 
    0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
(gdb) p/x $ymm14
$2 = v32_int8 = {0x0, 0xff, 0x1, 0xff, 
    0x2, 0xff, 0x3, 0xff, 0x4, 0xff, 0x5, 0xff, 0x6, 0xff, 0x7, 0xff, 0x8, 0xff, 0x9, 0xff, 0xa, 
    0xff, 0xb, 0xff, 0xc, 0xff, 0xd, 0xff, 0xe, 0xff, 0xf, 0xff}
(gdb) p/x $ymm15
$3 = v32_int8 = {0x10, 0xff, 0x11, 0xff, 
    0x12, 0xff, 0x13, 0xff, 0x14, 0xff, 0x15, 0xff, 0x16, 0xff, 0x17, 0xff, 0x18, 0xff, 0x19, 0xff, 
    0x1a, 0xff, 0x1b, 0xff, 0x1c, 0xff, 0x1d, 0xff, 0x1e, 0xff, 0x1f, 0xff}
(gdb) c
Continuing.

Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000555555555151 in main ()
(gdb) p/x $ymm9
$4 = v16_int16 = {
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}
(gdb) p/x $ymm12
$5 = v16_int16 = {
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}

Edit3: Working as intended, solution found:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Widens 32 unsigned bytes to 32 unsigned 16-bit values with
 * _mm256_cvtepu8_epi16 (16 bytes per call) and checks that every widened
 * element equals its position in the input.
 *
 * Prints "lol" once for each mismatching element; prints nothing when the
 * conversion is correct. Always returns 0.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    const uint8_t in[32] = {
        0x00, 0x01, 0x02, 0x03,
        0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0A, 0x0B,
        0x0C, 0x0D, 0x0E, 0x0F,
        0x10, 0x11, 0x12, 0x13,
        0x14, 0x15, 0x16, 0x17,
        0x18, 0x19, 0x1A, 0x1B,
        0x1C, 0x1D, 0x1E, 0x1F
    };

    // `in` is a plain byte array with no 16-byte alignment guarantee, so use
    // explicit unaligned loads instead of dereferencing it as __m128i.
    const __m128i low_bytes = _mm_loadu_si128((const __m128i*)&in[0]);
    const __m128i high_bytes = _mm_loadu_si128((const __m128i*)&in[16]);

    // Zero-extend each group of 16 bytes to 16 x 16-bit elements.
    const __m256i first = _mm256_cvtepu8_epi16(low_bytes);
    const __m256i second = _mm256_cvtepu8_epi16(high_bytes);

    // Store the vectors to ordinary arrays before reading single elements;
    // this avoids reading an __m256i object through a short pointer.
    uint16_t values[16];
    uint16_t values2[16];
    _mm256_storeu_si256((__m256i*)values, first);
    _mm256_storeu_si256((__m256i*)values2, second);

    for (int i = 0; i != 16; i++) {
        if (values[i] != i) {
            printf("lol");
        }
        if (values2[i] != i + 16) {
            printf("lol");
        }
    }

    return 0;
}


Solution 1:[1]

On the still broken line, I see ymm9 3 times, instead of 2 as shown on prior line. Data will be loaded in little endian, so ushorta will load as 0xff, 0x0f, 0xff, 0x0e, ... , 0xff, 0x00. To reverse all 32 bytes, the indexes should be: 0x1f, 0x1e, 0x1d, 0x1c, ... 0x01, 0x00. If the intent is not to reverse all 32 bytes, but just pairs, then indexes should be 0x01, 0x00, 0x03, 0x02, ... , 0x1f, 0x1e.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1