For the first time I have run into an unpleasant behavior of the Intel C optimizer: it inserts two unnecessary LEAs (the 'lea r15' at offset 00061 and the 'lea rcx' at offset 0009b in the listing below).
Here is the ugly snippet:
; mark_description "Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140";
; mark_description "726";
; mark_description "-O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -FAcs";
.B8.3::
  00030 44 8b 22             mov r12d, DWORD PTR [rdx]
  00033 44 89 e1             mov ecx, r12d
  00036 83 f1 03             xor ecx, 3
  00039 41 be ff ff ff ff    mov r14d, -1
  0003f c1 e1 03             shl ecx, 3
  00042 41 bf 01 00 00 00    mov r15d, 1
  00048 41 d3 ee             shr r14d, cl
  0004b 45 33 db             xor r11d, r11d
  0004e 45 23 e6             and r12d, r14d
  00051 49 89 c6             mov r14, rax
  00054 44 89 e1             mov ecx, r12d
  00057 45 89 e2             mov r10d, r12d
  0005a 83 e1 03             and ecx, 3
  0005d 45 0f 44 df          cmove r11d, r15d
  00061 4c 8d 7a 01          lea r15, QWORD PTR [1+rdx]
  00065 41 ff cb             dec r11d
  00068 49 83 e4 03          and r12, 3
  0006c 83 f1 03             xor ecx, 3
  0006f 41 c1 ea 02          shr r10d, 2
  00073 4c 89 dd             mov rbp, r11
  00076 48 f7 d5             not rbp
  00079 4d 2b f2             sub r14, r10
  0007c 4c 23 fd             and r15, rbp
  0007f 4d 23 f3             and r14, r11
  00082 4c 23 d5             and r10, rbp
  00085 4d 23 e3             and r12, r11
  00088 bd 20 00 00 00       mov ebp, 32
  0008d d3 ed                shr ebp, cl
  0008f c4 81 7e 6f 04 3e    vmovdqu ymm0, YMMWORD PTR [r14+r15]
  00095 49 23 eb             and rbp, r11
  00098 49 03 ea             add rbp, r10
  0009b 4b 8d 4c 22 01       lea rcx, QWORD PTR [1+r10+r12]
  000a0 48 03 d1             add rdx, rcx
  000a3 c5 fe 7f 00          vmovdqu YMMWORD PTR [rax], ymm0
  000a7 48 03 c5             add rax, rbp
  000aa 49 3b d0             cmp rdx, r8
  000ad 72 81                jb .B8.3
Is there any way to tell the compiler to increment srcLOCAL (the 'srcLOCAL+= 1;' statement in the source below) on the spot?
The C counterpart is this:
// Needs <stdint.h> for uint64_t and <string.h> for memcpy; SlowCopy256bit is defined elsewhere.
#include <stdint.h>
#include <string.h>

unsigned int Decompress (char* ret, char* src, unsigned int srcSize) {
    char* retLOCAL = ret;
    char* srcLOCAL = src;
    char* srcEndLOCAL = src+srcSize;
    unsigned int DWORDtrio;
    unsigned int Flag;
    uint64_t FlagMASK;         //= 0xFFFFFFFFFFFFFFFF;
    uint64_t FlagMASKnegated;  //= 0x0000000000000000;
    while (srcLOCAL < srcEndLOCAL) {
        DWORDtrio = *(unsigned int*)srcLOCAL;
//#ifndef _N_GP
//#ifdef _N_prefetch_4096
//        _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0);
//#endif
//#endif
        // |1stLSB    |2ndLSB  |3rdLSB |
        // -------------------------------
        // |OO|xx|xxxx|xxxxxxxx|xxxxxx|xx|
        // -------------------------------
        // [1bit 16bit] 24bit]
        // OO = 00b means Literal
        // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-OO=(1+OO)*8-2=14 or 16KB
        // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-OO=(1+OO)*8-2=22 or 4MB
        // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-OO=(1+OO)*8-2=30 or 1GB
        // Yamami uses 3 sliding Windows and one YMM register with 'automatic' Match Lengths 32/16/8 or YMM>>(3-OO).
        // Branchfull:
/*
        DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) );
        if ( (DWORDtrio & 0x03) == 0x00 ) {
#ifdef _N_GP
            memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 32);
#endif
#ifdef _N_YMM
            SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL );
#endif
            retLOCAL+= (DWORDtrio>>2);
            srcLOCAL+= (DWORDtrio>>2)+1;
        } else {
#ifdef _N_GP
            memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>2)) ), 32);
#endif
#ifdef _N_YMM
            SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>2)) ), retLOCAL );
#endif
            srcLOCAL+= 1+(DWORDtrio&0x03);          // 4|3|2
            retLOCAL+= 32>>(3-(DWORDtrio & 0x03));  // 8/16/32
        }
*/
        // Branchless:
        DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) );
        Flag=!(DWORDtrio & 0x03);   // In here Flag=0|1
        FlagMASKnegated= Flag - 1;  // -1|0
        FlagMASK= ~FlagMASKnegated;
        srcLOCAL+= 1;
#ifdef _N_YMM
//      SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), retLOCAL);
        SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), retLOCAL);
#endif
#ifdef _N_GP
//      memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), 32);
        memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), 32);
#endif
        retLOCAL+= ((uint64_t)((DWORDtrio>>2))&FlagMASK) + ((uint64_t)(32>>(3-(DWORDtrio & 0x03)))&FlagMASKnegated) ;
//      srcLOCAL+= ((uint64_t)((DWORDtrio>>2)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ;
        srcLOCAL+= ((uint64_t)((DWORDtrio>>2))&FlagMASK) + ((uint64_t)((DWORDtrio&0x03))&FlagMASKnegated) ;
    }
    return (unsigned int)(retLOCAL - ret);
}
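Side note for readers: the Flag / FlagMASK / FlagMASKnegated trio above is nothing but a branchless two-way select. A minimal standalone sketch of the idiom, with my own names (select64 is not part of the decompressor):

#include <stdint.h>
#include <stdio.h>

// Branchless two-way select, the same idiom as Flag/FlagMASK/FlagMASKnegated
// in Decompress: returns 'ifzero' when cond==0, 'ifnonzero' otherwise.
static uint64_t select64 (uint64_t cond, uint64_t ifzero, uint64_t ifnonzero) {
    uint64_t flag = !cond;            // 1 when cond==0, else 0
    uint64_t masknegated = flag - 1;  // 0 when cond==0, else 0xFFFFFFFFFFFFFFFF
    uint64_t mask = ~masknegated;     // 0xFFFFFFFFFFFFFFFF when cond==0, else 0
    return (ifzero & mask) + (ifnonzero & masknegated);
}

int main (void) {
    printf("%llu\n", (unsigned long long)select64(0, 11, 22)); // prints 11
    printf("%llu\n", (unsigned long long)select64(3, 11, 22)); // prints 22
    return 0;
}

In Decompress the two candidates being blended this way are the literal source pointer versus the back-reference pointer, and likewise the two pointer advances.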
As I see it, if the increment were made right away, the whole loop would contain no LEA at all, yes?
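To make the question concrete, the rearrangement I have in mind looks like this; it is only a sketch, and whether ICC would then emit a plain 'add rdx, 1' (rdx holds srcLOCAL in the listing) instead of the two LEAs is exactly what I am asking:

    while (srcLOCAL < srcEndLOCAL) {
        DWORDtrio = *(unsigned int*)srcLOCAL;
        srcLOCAL+= 1;  // incremented on the spot, right after the load;
                       // the copy below could then address srcLOCAL directly,
                       // with no 'lea r15, [1+rdx]' temporary needed
        DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) );
        // ... rest of the branchless body unchanged ...
    }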