110 lines
3.7 KiB
Diff
110 lines
3.7 KiB
Diff
From 903b77defb6f2ee2552c06472339f33091e3c7b4 Mon Sep 17 00:00:00 2001
|
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
Date: Tue, 21 Mar 2017 10:59:31 -0700
|
|
Subject: x86-64: Improve branch prediction in _dl_runtime_resolve_avx512_opt
|
|
[BZ #21258]
|
|
|
|
On Skylake server, _dl_runtime_resolve_avx512_opt is used to preserve
|
|
the first 8 vector registers. The code layout is
|
|
|
|
if only %xmm0 - %xmm7 registers are used
|
|
  preserve %xmm0 - %xmm7 registers
|
|
if only %ymm0 - %ymm7 registers are used
|
|
  preserve %ymm0 - %ymm7 registers
|
|
preserve %zmm0 - %zmm7 registers
|
|
|
|
Branch prediction always executes the fallthrough code path to preserve
|
|
%zmm0 - %zmm7 registers speculatively, even though only %xmm0 - %xmm7
|
|
registers are used. This leads to lower CPU frequency on Skylake
|
|
server. This patch changes the fallthrough code path to preserve
|
|
%xmm0 - %xmm7 registers instead:
|
|
|
|
if whole %zmm0 - %zmm7 registers are used
|
|
  preserve %zmm0 - %zmm7 registers
|
|
if only %ymm0 - %ymm7 registers are used
|
|
  preserve %ymm0 - %ymm7 registers
|
|
preserve %xmm0 - %xmm7 registers
|
|
|
|
Tested on Skylake server.
|
|
|
|
[BZ #21258]
|
|
* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
|
|
Define only if _dl_runtime_resolve is defined to
|
|
_dl_runtime_resolve_sse_vex.
|
|
* sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
|
|
Fallthrough to _dl_runtime_resolve_sse_vex.
|
|
|
|
(cherry picked from commit c15f8eb50cea7ad1a4ccece6e0982bf426d52c00)
|
|
|
|
diff --git a/ChangeLog b/ChangeLog
|
|
index b37a054bae..8479fba8c4 100644
|
|
--- a/ChangeLog
|
|
+++ b/ChangeLog
|
|
@@ -1,3 +1,12 @@
|
|
+2017-04-07 H.J. Lu <hongjiu.lu@intel.com>
|
|
+
|
|
+ [BZ #21258]
|
|
+ * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve_opt):
|
|
+ Define only if _dl_runtime_resolve is defined to
|
|
+ _dl_runtime_resolve_sse_vex.
|
|
+ * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_opt):
|
|
+ Fallthrough to _dl_runtime_resolve_sse_vex.
|
|
+
|
|
2017-04-03 Mike Frysinger <vapier@gentoo.org>
|
|
|
|
[BZ #21253]
|
|
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
|
|
index 33d7fcf7d0..c14c61aa58 100644
|
|
--- a/sysdeps/x86_64/dl-trampoline.S
|
|
+++ b/sysdeps/x86_64/dl-trampoline.S
|
|
@@ -87,11 +87,9 @@
|
|
#endif
|
|
#define VEC(i) zmm##i
|
|
#define _dl_runtime_resolve _dl_runtime_resolve_avx512
|
|
-#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
|
|
#define _dl_runtime_profile _dl_runtime_profile_avx512
|
|
#include "dl-trampoline.h"
|
|
#undef _dl_runtime_resolve
|
|
-#undef _dl_runtime_resolve_opt
|
|
#undef _dl_runtime_profile
|
|
#undef VEC
|
|
#undef VMOV
|
|
@@ -145,4 +143,5 @@
|
|
# define VMOV vmovdqu
|
|
#endif
|
|
#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
|
|
+#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
|
|
#include "dl-trampoline.h"
|
|
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
|
|
index b27fa06974..8db24c16ac 100644
|
|
--- a/sysdeps/x86_64/dl-trampoline.h
|
|
+++ b/sysdeps/x86_64/dl-trampoline.h
|
|
@@ -129,19 +129,20 @@ _dl_runtime_resolve_opt:
|
|
# YMM state isn't in use.
|
|
PRESERVE_BND_REGS_PREFIX
|
|
jz _dl_runtime_resolve_sse_vex
|
|
-# elif VEC_SIZE == 64
|
|
+# elif VEC_SIZE == 16
|
|
# For ZMM registers, check if YMM state and ZMM state are in
|
|
# use.
|
|
andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
|
|
cmpl $bit_YMM_state, %r11d
|
|
- # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
|
|
- # neither YMM state nor ZMM state are in use.
|
|
+ # Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
|
|
PRESERVE_BND_REGS_PREFIX
|
|
- jl _dl_runtime_resolve_sse_vex
|
|
+ jg _dl_runtime_resolve_avx512
|
|
# Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
|
|
# ZMM state isn't in use.
|
|
PRESERVE_BND_REGS_PREFIX
|
|
je _dl_runtime_resolve_avx
|
|
+ # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
|
|
+ # neither YMM state nor ZMM state are in use.
|
|
# else
|
|
# error Unsupported VEC_SIZE!
|
|
# endif
|
|
--
|
|
2.13.1
|
|
|