Quantcast
Channel: Intel® C++ Compiler
Viewing all articles
Browse latest Browse all 1616

How to enforce pointer incrementation while forbidding compiler to play smart

$
0
0

For the first time I am facing an unpleasant behavior of the Intel C optimizer: it inserts 2 unnecessary LEAs (lines #22 and #40 of the listing below).
Here is the ugly snippet:

; mark_description "Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140";
; mark_description "726";
; mark_description "-O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -FAcs";

.B8.3::  ; loop body of Decompress; going by the C source: rdx = srcLOCAL, rax = retLOCAL, r8 = srcEndLOCAL
  00030 44 8b 22         mov r12d, DWORD PTR [rdx]  ; DWORDtrio = *(unsigned int*)srcLOCAL
  00033 44 89 e1         mov ecx, r12d
  00036 83 f1 03         xor ecx, 3  ; low 2 bits become 3-OO
  00039 41 be ff ff ff
        ff               mov r14d, -1  ; 0xFFFFFFFF
  0003f c1 e1 03         shl ecx, 3  ; low bits of cl = (3-OO)<<3
  00042 41 bf 01 00 00
        00               mov r15d, 1  ; constant 1 consumed by the cmove below
  00048 41 d3 ee         shr r14d, cl  ; 0xFFFFFFFF >> ((3-OO)<<3)
  0004b 45 33 db         xor r11d, r11d  ; r11d = 0
  0004e 45 23 e6         and r12d, r14d  ; DWORDtrio masked down to the token's bytes
  00051 49 89 c6         mov r14, rax  ; r14 = retLOCAL
  00054 44 89 e1         mov ecx, r12d
  00057 45 89 e2         mov r10d, r12d
  0005a 83 e1 03         and ecx, 3  ; OO = DWORDtrio & 3
  0005d 45 0f 44 df      cmove r11d, r15d  ; Flag = (OO == 0) ? 1 : 0
  00061 4c 8d 7a 01      lea r15, QWORD PTR [1+rdx]  ; srcLOCAL+1 (first LEA the post asks about)
  00065 41 ff cb         dec r11d  ; FlagMASKnegated = Flag-1, done on the 32-bit register
  00068 49 83 e4 03      and r12, 3  ; OO, kept for the srcLOCAL advance
  0006c 83 f1 03         xor ecx, 3  ; 3-OO
  0006f 41 c1 ea 02      shr r10d, 2  ; DWORDtrio>>2
  00073 4c 89 dd         mov rbp, r11
  00076 48 f7 d5         not rbp  ; FlagMASK = ~FlagMASKnegated
  00079 4d 2b f2         sub r14, r10  ; retLOCAL-(DWORDtrio>>2)
  0007c 4c 23 fd         and r15, rbp  ; (srcLOCAL+1) & FlagMASK
  0007f 4d 23 f3         and r14, r11  ; (retLOCAL-(DWORDtrio>>2)) & FlagMASKnegated
  00082 4c 23 d5         and r10, rbp  ; (DWORDtrio>>2) & FlagMASK
  00085 4d 23 e3         and r12, r11  ; OO & FlagMASKnegated
  00088 bd 20 00 00 00   mov ebp, 32
  0008d d3 ed            shr ebp, cl  ; 32>>(3-OO)
  0008f c4 81 7e 6f 04
        3e               vmovdqu ymm0, YMMWORD PTR [r14+r15]  ; load 32 bytes from the selected copy source
  00095 49 23 eb         and rbp, r11  ; (32>>(3-OO)) & FlagMASKnegated
  00098 49 03 ea         add rbp, r10  ; rbp = retLOCAL increment
  0009b 4b 8d 4c 22 01   lea rcx, QWORD PTR [1+r10+r12]  ; srcLOCAL increment with the +1 folded in (second LEA)
  000a0 48 03 d1         add rdx, rcx  ; srcLOCAL += increment
  000a3 c5 fe 7f 00      vmovdqu YMMWORD PTR [rax], ymm0  ; store 32 bytes at retLOCAL
  000a7 48 03 c5         add rax, rbp  ; retLOCAL += increment
  000aa 49 3b d0         cmp rdx, r8  ; srcLOCAL < srcEndLOCAL ?
  000ad 72 81            jb .B8.3  ; loop while input remains

Is there any way to tell the compiler to increment srcLOCAL (the `srcLOCAL+= 1;` statement, line #56) on the spot?
The C counterpart is this:

/*
 * Decompress - branchless decoder for a stream of literal/match tokens.
 *
 * ret     - output buffer. When _N_YMM or _N_GP is defined every token stores
 *           a full 32 bytes at the current output position, even when the
 *           position then advances by less, so the buffer needs slack beyond
 *           the decoded size.
 * src     - compressed input.
 * srcSize - number of compressed bytes.
 *
 * Returns the number of bytes produced (retLOCAL - ret).
 *
 * NOTE(review): every iteration loads 4 bytes at srcLOCAL regardless of the
 * token length, so up to 3 bytes past src+srcSize must be readable.
 * NOTE(review): the load goes through an (unsigned int*) cast of a char
 * pointer (unaligned, type-punned); this relies on the target tolerating it.
 * NOTE(review): the layout below puts the tag in the first byte, which
 * presumes a little-endian load as on the x86-64 target of the listing.
 *
 * Token layout:
 * |1stLSB    |2ndLSB  |3rdLSB   |
 * -------------------------------
 * |OO|xx|xxxx|xxxxxxxx|xxxxxx|xx|
 * -------------------------------
 * [1bit          16bit]    24bit]
 * OO = 00b means Literal
 * OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-OO=(1+OO)*8-2=14 or 16KB
 * OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-OO=(1+OO)*8-2=22 or  4MB
 * OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-OO=(1+OO)*8-2=30 or  1GB
 * Yamami uses 3 sliding Windows and one YMM register with 'automatic' Match Lengths 32/16/8 or YMM>>(3-OO).
 *
 * Equivalent branching form of one iteration:
 *   literal (OO == 0): copy 32 bytes from srcLOCAL+1;
 *                      retLOCAL += DWORDtrio>>2; srcLOCAL += (DWORDtrio>>2)+1;
 *   match   (OO != 0): copy 32 bytes from retLOCAL-(DWORDtrio>>2);
 *                      srcLOCAL += 1+OO; retLOCAL += 32>>(3-OO);
 */
unsigned int Decompress (char* ret, char* src, unsigned int srcSize) {
	char* retLOCAL = ret;
	char* srcLOCAL = src;
	char* srcEndLOCAL = src+srcSize;
	unsigned int DWORDtrio;
	unsigned int Flag;
	uint64_t FlagMASK;        // literal: 0xFFFFFFFFFFFFFFFF, match: 0
	uint64_t FlagMASKnegated; // literal: 0, match: 0xFFFFFFFFFFFFFFFF

	while (srcLOCAL < srcEndLOCAL) {
		DWORDtrio = *(unsigned int*)srcLOCAL;

		// Keep only the bytes that belong to this token: 1 byte for a literal, 1+OO for a match.
		DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) );
		Flag=!(DWORDtrio & 0x03);
		// In here Flag=0|1
		// Widen BEFORE subtracting: 'Flag - 1' evaluated in 32-bit unsigned yields
		// 0x00000000FFFFFFFF once stored in a uint64_t, so the address select below
		// would mix the upper half of srcLOCAL with the lower half of the match address.
		FlagMASKnegated= (uint64_t)Flag - 1; // -1|0
		FlagMASK= ~FlagMASKnegated;
		// Step over the tag byte right away; for a literal srcLOCAL now points at the payload.
		srcLOCAL+= 1;
				#ifdef _N_YMM
		// Exactly one of the two masked addresses is non-zero, so the sum selects it.
		SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), retLOCAL);
				#endif
				#ifdef _N_GP
		// NOTE(review): for a match with offset < 32 source and destination overlap,
		// which memcpy does not define — confirm the intended copy semantics.
		memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>2))&FlagMASKnegated) ), 32);
				#endif
		// literal: advance by the literal length; match: by 8/16/32.
		retLOCAL+= ((uint64_t)((DWORDtrio>>2))&FlagMASK) +   ((uint64_t)(32>>(3-(DWORDtrio & 0x03)))&FlagMASKnegated) ;
		// literal: skip the payload; match: skip the remaining OO offset bytes (the +1 was applied above).
		srcLOCAL+= ((uint64_t)((DWORDtrio>>2))&FlagMASK) + ((uint64_t)((DWORDtrio&0x03))&FlagMASKnegated) ;
	}
	return (unsigned int)(retLOCAL - ret);
}

As I see it, if the increment were performed right away, the whole loop would contain no LEA at all, correct?
 


Viewing all articles
Browse latest Browse all 1616

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>