C++ vs Java?为什么ICC生成的代码比VC慢?[已关闭]编辑:更新OP的结果:
以下是C++中的一个简单循环。计时器使用的是 QueryPerformanceCounter(),非常准确。我发现Java所用的时间只有C++的60%,这怎么可能?!我在这里做错了什么?即使启用严格别名(strict aliasing,此处的代码中未包含)也没有任何帮助...
// Micro-benchmark: repeatedly sums a 1024-element int array into a 64-bit
// accumulator and prints the elapsed time of the summation loops.
long long var = 0;
std::array<int, 1024> arr;
int* arrPtr = arr.data();
// NOTE(review): CHighPrecisionTimer is declared elsewhere; the surrounding text
// says it is based on QueryPerformanceCounter() — confirm.
CHighPrecisionTimer timer;
// Fill the array with 0..1023 before the timer starts.
for(int i = 0; i < 1024; i++) arrPtr[i] = i;
timer.Start();
// Timed region: 1024 * 1024 * 10 full passes over the array.
for(int i = 0; i < 1024 * 1024 * 10; i++){
for(int x = 0; x < 1024; x++){
var += arrPtr[x];
}
}
timer.Stop();
// The accumulated sum is printed together with the elapsed time, so the
// result of the loops is part of the program's observable output.
printf("Unrestricted: %lld us, Value = %lld\n", (Int64)timer.GetElapsed().GetMicros(), var);
这段C++代码大约需要 9.5 秒完成。我使用的是英特尔编译器 12.1,开启了主机处理器优化(专门针对我的处理器),并且所有优化选项都调到了最高。这已经是英特尔编译器的最佳状态了!自动并行化可笑地消耗了70%的CPU而不是25%,但并没有更快地完成工作;)...
现在我使用以下Java代码进行比较:
// Java counterpart of the benchmark: sums a 1024-element int array into a long.
long var = 0;
int[] arr = new int[1024];
// Fill the array with 0..1023.
for(int i = 0; i < 1024; i++) arr[i] = i;
// Untimed run of 1024 * 1024 passes before the clock is read — presumably a
// JIT warm-up; note that it also adds to var.
for(int i = 0; i < 1024 * 1024; i++){
for(int x = 0; x < 1024; x++){
var += arr[x];
}
}
// Timed region: 1024 * 1024 * 10 passes over the array.
long nanos = System.nanoTime();
for(int i = 0; i < 1024 * 1024 * 10; i++){
for(int x = 0; x < 1024; x++){
var += arr[x];
}
}
// Elapsed nanoseconds divided by 1000, i.e. the printed time is in microseconds.
nanos = (System.nanoTime() - nanos) / 1000;
System.out.print("Value: " + var + ", Time: " + nanos);
Java 代码在启用激进优化的服务器 VM(无调试)下运行。它在我的计算机上运行大约7秒(仅使用一个线程)。
这是英特尔编译器的失败,还是我又太笨了?
[编辑]:好吧,现在情况是这样的...这似乎更像是英特尔编译器中的一个错误 ^^。[请注意,我运行的是英特尔四核 Q6600,它相当旧。英特尔编译器在较新的CPU(如Core i7)上的表现可能更好。]
Intel x86 (without vectorization): 3 seconds
MSVC x64: 5 seconds
Java x86/x64 (Oracle Java 7): 7 seconds
Intel x64 (with vectorization): 9.5 seconds
Intel x86 (with vectorization): 9.5 seconds
Intel x64 (without vectorization): 12 seconds
MSVC x86: 15 seconds (uhh)
[编辑]:另一个很好的例子;)。请考虑以下简单的 lambda 表达式
#include <stdio.h>
#include <tchar.h>
#include <Windows.h>
#include <vector>
#include <boost/function.hpp>
#include <boost/lambda/bind.hpp>
#include <boost/typeof/typeof.hpp>
/// Minimal append-only list backed by std::vector.
///
/// @tparam TValue element type stored in the list.
template<class TValue>
struct ArrayList
{
private:
    std::vector<TValue> m_Entries;
public:
    /// Calls inCallback once for every stored entry, in insertion order.
    ///
    /// @param inCallback callable that accepts a stored entry (a TValue).
    template<class TCallback>
    void Foreach(TCallback inCallback)
    {
        // Use the vector's own size type so the element count is not
        // narrowed to int.
        typedef typename std::vector<TValue>::size_type SizeType;

        // The size is read once up front, so only the entries present when
        // iteration starts are visited.
        for(SizeType i = 0, size = m_Entries.size(); i < size; i++)
        {
            // Hand the stored entry to the callback, not the loop index.
            inCallback(m_Entries[i]);
        }
    }

    /// Appends a copy of inValue to the end of the list.
    ///
    /// @param inValue value to store.
    void Add(TValue inValue)
    {
        m_Entries.push_back(inValue);
    }
};
/// Benchmark entry point: times 1000 * 1000 * 10 calls of ArrayList::Foreach
/// with a capturing lambda over a 100-entry list and prints the elapsed time.
///
/// @return -1 if the accumulated result equals 4950, otherwise 0.
int _tmain(int argc, _TCHAR* argv[])
{
// Empty lambda; it is not referenced again anywhere in this function.
auto t = [&]() {};
ArrayList<int> arr;
int res = 0;
// Populate the list with the values 0..99.
for(int i = 0; i < 100; i++)
{
arr.Add(i);
}
long long freq, t1, t2;
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
// Timed region.
for(int i = 0; i < 1000 * 1000 * 10; i++)
{
// The lambda ignores its parameter v and adds the outer loop counter i
// (captured by reference) to res on every invocation.
// NOTE(review): the running total looks like it exceeds the range of int
// long before the loop ends (signed overflow) — confirm.
arr.Foreach([&](int v) {
res += i;
});
}
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
// Counter difference scaled by 1000000 and divided by the reported frequency;
// presumably microseconds, assuming freq is in counts per second — confirm.
printf("Time: %lld\n", ((t2-t1) * 1000000) / freq);
// 4950 equals 0 + 1 + ... + 99. Reading res here makes the loop's result
// affect the return value — presumably so the work is not optimized away.
if(res == 4950)
return -1;
return 0;
}
英特尔编译器再次大放异彩:
MSVC x86/x64: 12 milli seconds
Intel x86/x64: 1 second
嗯?!好吧,我想慢90倍并不是一件坏事...
我不再确定这一点是否仍然适用:根据这个帖子下的回答,众所周知(我也知道这一点,只是没想到他们会放弃对某些处理器的支持),英特尔编译器在它不“认识”的处理器上性能很差,例如AMD处理器,甚至可能包括像我这样过时的英特尔处理器...因此,如果拥有最新英特尔处理器的人可以尝试一下,那就太好了;)。
以下是英特尔编译器的 x64 输出:
std::array<int, 1024> arr;
int* arrPtr = arr.data();
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
000000013F05101D lea rcx,[freq]
000000013F051022 call qword ptr [__imp_QueryPerformanceFrequency (13F052000h)]
for(int i = 0; i < 1024; i++) arrPtr[i] = i;
000000013F051028 mov eax,4
000000013F05102D movd xmm0,eax
000000013F051031 xor eax,eax
000000013F051033 pshufd xmm1,xmm0,0
000000013F051038 movdqa xmm0,xmmword ptr [__xi_z+28h (13F0521A0h)]
000000013F051040 movdqa xmmword ptr arr[rax*4],xmm0
000000013F051046 paddd xmm0,xmm1
000000013F05104A movdqa xmmword ptr [rsp+rax*4+60h],xmm0
000000013F051050 paddd xmm0,xmm1
000000013F051054 movdqa xmmword ptr [rsp+rax*4+70h],xmm0
000000013F05105A paddd xmm0,xmm1
000000013F05105E movdqa xmmword ptr [rsp+rax*4+80h],xmm0
000000013F051067 add rax,10h
000000013F05106B paddd xmm0,xmm1
000000013F05106F cmp rax,400h
000000013F051075 jb wmain+40h (13F051040h)
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
000000013F051077 lea rcx,[t1]
000000013F05107C call qword ptr [__imp_QueryPerformanceCounter (13F052008h)]
var += arrPtr[x];
000000013F051082 movdqa xmm1,xmmword ptr [__xi_z+38h (13F0521B0h)]
for(int i = 0; i < 1024 * 1024 * 10; i++){
000000013F05108A xor eax,eax
var += arrPtr[x];
000000013F05108C movdqa xmm0,xmmword ptr [__xi_z+48h (13F0521C0h)]
long long var = 0, freq, t1, t2;
000000013F051094 pxor xmm6,xmm6
for(int x = 0; x < 1024; x++){
000000013F051098 xor r8d,r8d
var += arrPtr[x];
000000013F05109B lea rdx,[arr]
000000013F0510A0 xor ecx,ecx
000000013F0510A2 movq xmm2,mmword ptr arr[rcx]
for(int x = 0; x < 1024; x++){
000000013F0510A8 add r8,8
var += arrPtr[x];
000000013F0510AC punpckldq xmm2,xmm2
for(int x = 0; x < 1024; x++){
000000013F0510B0 add rcx,20h
var += arrPtr[x];
000000013F0510B4 movdqa xmm3,xmm2
000000013F0510B8 pand xmm2,xmm0
000000013F0510BC movq xmm4,mmword ptr [rdx+8]
000000013F0510C1 psrad xmm3,1Fh
000000013F0510C6 punpckldq xmm4,xmm4
000000013F0510CA pand xmm3,xmm1
000000013F0510CE por xmm3,xmm2
000000013F0510D2 movdqa xmm5,xmm4
000000013F0510D6 movq xmm2,mmword ptr [rdx+10h]
000000013F0510DB psrad xmm5,1Fh
000000013F0510E0 punpckldq xmm2,xmm2
000000013F0510E4 pand xmm5,xmm1
000000013F0510E8 paddq xmm6,xmm3
000000013F0510EC pand xmm4,xmm0
000000013F0510F0 movdqa xmm3,xmm2
000000013F0510F4 por xmm5,xmm4
000000013F0510F8 psrad xmm3,1Fh
000000013F0510FD movq xmm4,mmword ptr [rdx+18h]
000000013F051102 pand xmm3,xmm1
000000013F051106 punpckldq xmm4,xmm4
000000013F05110A pand xmm2,xmm0
000000013F05110E por xmm3,xmm2
000000013F051112 movdqa xmm2,xmm4
000000013F051116 paddq xmm6,xmm5
000000013F05111A psrad xmm2,1Fh
000000013F05111F pand xmm4,xmm0
000000013F051123 pand xmm2,xmm1
for(int x = 0; x < 1024; x++){
000000013F051127 add rdx,20h
var += arrPtr[x];
000000013F05112B paddq xmm6,xmm3
000000013F05112F por xmm2,xmm4
for(int x = 0; x < 1024; x++){
000000013F051133 cmp r8,400h
var += arrPtr[x];
000000013F05113A paddq xmm6,xmm2
for(int x = 0; x < 1024; x++){
000000013F05113E jb wmain+0A2h (13F0510A2h)
for(int i = 0; i < 1024 * 1024 * 10; i++){
000000013F051144 inc eax
000000013F051146 cmp eax,0A00000h
000000013F05114B jb wmain+98h (13F051098h)
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
000000013F051151 lea rcx,[t2]
000000013F051156 call qword ptr [__imp_QueryPerformanceCounter (13F052008h)]
printf("Unrestricted: %lld ms, Value = %lld\n", ((t2-t1)*1000/freq), var);
000000013F05115C mov r9,qword ptr [t2]
long long var = 0, freq, t1, t2;
000000013F051161 movdqa xmm0,xmm6
printf("Unrestricted: %lld ms, Value = %lld\n", ((t2-t1)*1000/freq), var);
000000013F051165 sub r9,qword ptr [t1]
000000013F05116A lea rcx,[string "Unrestricted: %lld ms, Value = %"... (13F0521D0h)]
000000013F051171 imul rax,r9,3E8h
000000013F051178 cqo
000000013F05117A mov r10,qword ptr [freq]
000000013F05117F idiv rax,r10
long long var = 0, freq, t1, t2;
000000013F051182 psrldq xmm0,8
printf("Unrestricted: %lld ms, Value = %lld\n", ((t2-t1)*1000/freq), var);
000000013F051187 mov rdx,rax
long long var = 0, freq, t1, t2;
000000013F05118A paddq xmm6,xmm0
000000013F05118E movd r8,xmm6
printf("Unrestricted: %lld ms, Value = %lld\n", ((t2-t1)*1000/freq), var);
000000013F051193 call qword ptr [__imp_printf (13F052108h)]
这是MSVC x64版本的汇编:
int _tmain(int argc, _TCHAR* argv[])
{
000000013FF61000 push rbx
000000013FF61002 mov eax,1050h
000000013FF61007 call __chkstk (13FF61950h)
000000013FF6100C sub rsp,rax
000000013FF6100F mov rax,qword ptr [__security_cookie (13FF63000h)]
000000013FF61016 xor rax,rsp
000000013FF61019 mov qword ptr [rsp+1040h],rax
long long var = 0, freq, t1, t2;
std::array<int, 1024> arr;
int* arrPtr = arr.data();
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
000000013FF61021 lea rcx,[rsp+28h]
000000013FF61026 xor ebx,ebx
000000013FF61028 call qword ptr [__imp_QueryPerformanceFrequency (13FF62000h)]
for(int i = 0; i < 1024; i++) arrPtr[i] = i;
000000013FF6102E xor r11d,r11d
000000013FF61031 lea rax,[rsp+40h]
000000013FF61036 mov dword ptr [rax],r11d
000000013FF61039 inc r11d
000000013FF6103C add rax,4
000000013FF61040 cmp r11d,400h
000000013FF61047 jl wmain+36h (13FF61036h)
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
000000013FF61049 lea rcx,[rsp+20h]
000000013FF6104E call qword ptr [__imp_QueryPerformanceCounter (13FF62008h)]
000000013FF61054 mov r11d,0A00000h
000000013FF6105A nop word ptr [rax+rax]
for(int i = 0; i < 1024 * 1024 * 10; i++){
for(int x = 0; x < 1024; x++){
000000013FF61060 xor edx,edx
000000013FF61062 xor r8d,r8d
000000013FF61065 lea rcx,[rsp+48h]
000000013FF6106A xor r9d,r9d
000000013FF6106D mov r10d,100h
000000013FF61073 nop word ptr [rax+rax]
var += arrPtr[x];
000000013FF61080 movsxd rax,dword ptr [rcx-8]
000000013FF61084 add rcx,10h
000000013FF61088 add rbx,rax
000000013FF6108B movsxd rax,dword ptr [rcx-14h]
000000013FF6108F add r9,rax
000000013FF61092 movsxd rax,dword ptr [rcx-10h]
000000013FF61096 add r8,rax
000000013FF61099 movsxd rax,dword ptr [rcx-0Ch]
000000013FF6109D add rdx,rax
000000013FF610A0 dec r10
000000013FF610A3 jne wmain+80h (13FF61080h)
for(int i = 0; i < 1024 * 1024 * 10; i++){
for(int x = 0; x < 1024; x++){
000000013FF610A5 lea rax,[rdx+r8]
000000013FF610A9 add rax,r9
000000013FF610AC add rbx,rax
000000013FF610AF dec r11
000000013FF610B2 jne wmain+60h (13FF61060h)
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
000000013FF610B4 lea rcx,[rsp+30h]
000000013FF610B9 call qword ptr [__imp_QueryPerformanceCounter (13FF62008h)]
printf("Unrestricted: %lld ms, Value = %lld\n", ((t2-t1)*1000/freq), var);
000000013FF610BF mov rax,qword ptr [rsp+30h]
000000013FF610C4 lea rcx,[string "Unrestricted: %lld ms, Value = %"... (13FF621B0h)]
000000013FF610CB sub rax,qword ptr [rsp+20h]
000000013FF610D0 mov r8,rbx
000000013FF610D3 imul rax,rax,3E8h
000000013FF610DA cqo
000000013FF610DC idiv rax,qword ptr [rsp+28h]
000000013FF610E1 mov rdx,rax
000000013FF610E4 call qword ptr [__imp_printf (13FF62138h)]
return 0;
000000013FF610EA xor eax,eax
英特尔编译器,未启用矢量化、64 位、最高优化(出乎意料地慢,耗时 12 秒):
000000013FC0102F lea rcx,[freq]
double var = 0; long long freq, t1, t2;
000000013FC01034 xorps xmm6,xmm6
std::array<double, 1024> arr;
double* arrPtr = arr.data();
QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
000000013FC01037 call qword ptr [__imp_QueryPerformanceFrequency (13FC02000h)]
for(int i = 0; i < 1024; i++) arrPtr[i] = i;
000000013FC0103D mov eax,2
000000013FC01042 mov rdx,100000000h
000000013FC0104C movd xmm0,eax
000000013FC01050 xor eax,eax
000000013FC01052 pshufd xmm1,xmm0,0
000000013FC01057 movd xmm0,rdx
000000013FC0105C nop dword ptr [rax]
000000013FC01060 cvtdq2pd xmm2,xmm0
000000013FC01064 paddd xmm0,xmm1
000000013FC01068 cvtdq2pd xmm3,xmm0
000000013FC0106C paddd xmm0,xmm1
000000013FC01070 cvtdq2pd xmm4,xmm0
000000013FC01074 paddd xmm0,xmm1
000000013FC01078 cvtdq2pd xmm5,xmm0
000000013FC0107C movaps xmmword ptr arr[rax*8],xmm2
000000013FC01081 paddd xmm0,xmm1
000000013FC01085 movaps xmmword ptr [rsp+rax*8+60h],xmm3
000000013FC0108A movaps xmmword ptr [rsp+rax*8+70h],xmm4
000000013FC0108F movaps xmmword ptr [rsp+rax*8+80h],xmm5
000000013FC01097 add rax,8
000000013FC0109B cmp rax,400h
000000013FC010A1 jb wmain+60h (13FC01060h)
QueryPerformanceCounter((LARGE_INTEGER*)&t1);
000000013FC010A3 lea rcx,[t1]
000000013FC010A8 call qword ptr [__imp_QueryPerformanceCounter (13FC02008h)]
for(int i = 0; i < 1024 * 1024 * 10; i++){
000000013FC010AE xor eax,eax
for(int x = 0; x < 1024; x++){
000000013FC010B0 xor edx,edx
var += arrPtr[x];
000000013FC010B2 lea ecx,[rdx+rdx]
for(int x = 0; x < 1024; x++){
000000013FC010B5 inc edx
for(int x = 0; x < 1024; x++){
000000013FC010B7 cmp edx,200h
var += arrPtr[x];
000000013FC010BD addsd xmm6,mmword ptr arr[rcx*8]
000000013FC010C3 addsd xmm6,mmword ptr [rsp+rcx*8+58h]
for(int x = 0; x < 1024; x++){
000000013FC010C9 jb wmain+0B2h (13FC010B2h)
for(int i = 0; i < 1024 * 1024 * 10; i++){
000000013FC010CB inc eax
000000013FC010CD cmp eax,0A00000h
000000013FC010D2 jb wmain+0B0h (13FC010B0h)
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
000000013FC010D4 lea rcx,[t2]
000000013FC010D9 call qword ptr [__imp_QueryPerformanceCounter (13FC02008h)]
没有矢量化、32 位和最高优化的英特尔编译器(这个显然是现在的赢家,运行时间约为 3 秒,汇编看起来好多了):
00B81088 lea eax,[t1]
00B8108C push eax
00B8108D call dword ptr [__imp__QueryPerformanceCounter@4 (0B82004h)]
00B81093 xor eax,eax
00B81095 pxor xmm0,xmm0
00B81099 movaps xmm1,xmm0
for(int x = 0; x < 1024; x++){
00B8109C xor edx,edx
var += arrPtr[x];
00B8109E addpd xmm0,xmmword ptr arr[edx*8]
00B810A4 addpd xmm1,xmmword ptr [esp+edx*8+40h]
00B810AA addpd xmm0,xmmword ptr [esp+edx*8+50h]
00B810B0 addpd xmm1,xmmword ptr [esp+edx*8+60h]
for(int x = 0; x < 1024; x++){
00B810B6 add edx,8
00B810B9 cmp edx,400h
00B810BF jb wmain+9Eh (0B8109Eh)
for(int i = 0; i < 1024 * 1024 * 10; i++){
00B810C1 inc eax
00B810C2 cmp eax,0A00000h
00B810C7 jb wmain+9Ch (0B8109Ch)
double var = 0; long long freq, t1, t2;
00B810C9 addpd xmm0,xmm1
}
}
QueryPerformanceCounter((LARGE_INTEGER*)&t2);
00B810CD lea eax,[t2]
00B810D1 push eax
00B810D2 movaps xmmword ptr [esp+4],xmm0
00B810D7 call dword ptr [__imp__QueryPerformanceCounter@4 (0B82004h)]
00B810DD movaps xmm0,xmmword ptr [esp]