大佬教程收集整理的这篇文章主要介绍了c – AVX2比Haswell上的SSE慢,大佬教程大佬觉得挺不错的,现在分享给大家,也给大家做个参考。
int testSSE(const aligned_vector & ghs,const aligned_vector & lhs) { int result[4] __attribute__((aligned(16))) = {0}; __m128i vresult = _mm_set1_epi32(0); __m128i v1,v2,vmax; for (int k = 0; k < ghs.size(); k += 4) { v1 = _mm_load_si128((__m128i *) & lhs[k]); v2 = _mm_load_si128((__m128i *) & ghs[k]); vmax = _mm_add_epi32(v1,v2); vresult = _mm_max_epi32(vresult,vmaX); } _mm_store_si128((__m128i *) result,vresult); int mymax = result[0]; for (int k = 1; k < 4; k++) { if (result[k] > mymaX) { mymax = result[k]; } } return mymax; } int testAVX(const aligned_vector & ghs,const aligned_vector & lhs) { int result[8] __attribute__((aligned(32))) = {0}; __m256i vresult = _mm256_set1_epi32(0); __m256i v1,vmax; for (int k = 0; k < ghs.size(); k += 8) { v1 = _mm256_load_si256((__m256i *) & ghs[ k]); v2 = _mm256_load_si256((__m256i *) & lhs[k]); vmax = _mm256_add_epi32(v1,v2); vresult = _mm256_max_epi32(vresult,vmaX); } _mm256_store_si256((__m256i *) result,vresult); int mymax = result[0]; for (int k = 1; k < 8; k++) { if (result[k] > mymaX) { mymax = result[k]; } } return mymax; } int testNormal(const aligned_vector & ghs,const aligned_vector & lhs) { int max = 0; int tempMax; for (int k = 0; k < ghs.size(); k++) { tempMax = lhs[k] + ghs[k]; if (max < tempMaX) { max = tempMax; } } return max; }
void alignTestSSE() { aligned_vector lhs; aligned_vector ghs; int mySize = 4096; int FinalResult; int nofTESTCases = 1000; double time,time1,time2,time3; vector<int> lhs2; vector<int> ghs2; lhs.resize(mySizE); ghs.resize(mySizE); lhs2.resize(mySizE); ghs2.resize(mySizE); srand(1); for (int k = 0; k < mySize; k++) { lhs[k] = randomNodEID(1000000); lhs2[k] = lhs[k]; ghs[k] = randomNodEID(1000000); ghs2[k] = ghs[k]; } /* Warming UP */ for (int k = 0; k < nofTESTCases; k++) { FinalResult = testNormal(lhs,ghs); } for (int k = 0; k < nofTESTCases; k++) { FinalResult = testSSE(lhs,ghs); } for (int k = 0; k < nofTESTCases; k++) { FinalResult = testAVX(lhs,ghs); } cout << "===========================" << endl; time = timestamp(); for (int k = 0; k < nofTESTCases; k++) { FinalResult = testSSE(lhs,ghs); } time = timestamp() - time; time1 = time; cout << "SSE took " << time << " s" << endl; cout << "SSE Result: " << FinalResult << endl; time = timestamp(); for (int k = 0; k < nofTESTCases; k++) { FinalResult = testAVX(lhs,ghs); } time = timestamp() - time; time3 = time; cout << "AVX took " << time << " s" << endl; cout << "AVX Result: " << FinalResult << endl; time = timestamp(); for (int k = 0; k < nofTESTCases; k++) { FinalResult = testNormal(lhs,ghs); } time = timestamp() - time; cout << "Normal took " << time << " s" << endl; cout << "Normal Result: " << FinalResult << endl; cout << "SpeedUP SSE= " << time / time1 << " s" << endl; cout << "SpeedUP AVX= " << time / time3 << " s" << endl; cout << "===========================" << endl; ghs.clear(); lhs.clear(); }
其中
/**
 * Returns the current wall-clock time in seconds as a double, with the
 * microsecond part from gettimeofday() as the fraction.
 *
 * @return seconds (tv_sec) plus tv_usec / 1e6
 */
inline double timestamp() {
    struct timeval tp;
    // The second argument (time zone) is not needed.
    gettimeofday(&tp, NULL);
    return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}
和
// Vector of int whose memory is obtained through aligned_allocator, which is
// defined outside this excerpt.
// NOTE(review): the second template argument is sizeof (int). If that
// argument is the byte alignment, it requests only 4-byte alignment, while
// _mm_load_si128 / _mm256_load_si256 need 16 / 32 bytes. Confirm against the
// allocator's definition.
typedef vector<int,aligned_allocator<int,sizeof (int)> > aligned_vector;
是一个对齐的向量,使用了 https://gist.github.com/donny-dont/1471329 中的 AlignedAllocator。
我有一台 Intel i7 Haswell 4771,系统是最新的 Ubuntu 14.04 64 位,编译器为 gcc 4.8.2,一切都是最新的。我使用 -march=native -mtune=native -O3 -m64 编译。
结果是:
SSE took 0.000375986 s SSE Result: 1982689 AVX took 0.000459909 s AVX Result: 1982689 Normal took 0.00315714 s Normal Result: 1982689 SpeedUP SSE= 8.39696 s SpeedUP AVX= 6.8647 s
这表明完全相同的代码在AVX2上比SSE慢22%.我做错了什么还是这种正常行为?
#include <iostream> using namespace std; #include <sys/time.h> #include <cstdlib> #include <cstdint> #include <immintrin.h> inline double timestamp() { struct timeval tp; gettimeofday(&tp,null); return double(tp.tv_seC) + tp.tv_usec / 1000000.; } int testSSE(const int32_t * ghs,const int32_t * lhs,size_t n) { int result[4] __attribute__((aligned(16))) = {0}; __m128i vresult = _mm_set1_epi32(0); __m128i v1,vmax; for (int k = 0; k < n; k += 4) { v1 = _mm_load_si128((__m128i *) & lhs[k]); v2 = _mm_load_si128((__m128i *) & ghs[k]); vmax = _mm_add_epi32(v1,vresult); int mymax = result[0]; for (int k = 1; k < 4; k++) { if (result[k] > mymaX) { mymax = result[k]; } } return mymax; } int testAVX(const int32_t * ghs,size_t n) { int result[8] __attribute__((aligned(32))) = {0}; __m256i vresult = _mm256_set1_epi32(0); __m256i v1,vmax; for (int k = 0; k < n; k += 8) { v1 = _mm256_load_si256((__m256i *) & ghs[k]); v2 = _mm256_load_si256((__m256i *) & lhs[k]); vmax = _mm256_add_epi32(v1,vresult); int mymax = result[0]; for (int k = 1; k < 8; k++) { if (result[k] > mymaX) { mymax = result[k]; } } return mymax; } int testNormal(const int32_t * ghs,size_t n) { int max = 0; int tempMax; for (int k = 0; k < n; k++) { tempMax = lhs[k] + ghs[k]; if (max < tempMaX) { max = tempMax; } } return max; } void alignTestSSE() { int n = 4096; int normalResult,sseResult,avxResult; int nofTESTCases = 1000; double time,normalTime,sseTime,avxTime; int lhs[n] __attribute__ ((aligned(32))); int ghs[n] __attribute__ ((aligned(32))); for (int k = 0; k < n; k++) { lhs[k] = arc4random(); ghs[k] = arc4random(); } /* Warming UP */ for (int k = 0; k < nofTESTCases; k++) { normalResult = testNormal(lhs,ghs,n); } for (int k = 0; k < nofTESTCases; k++) { sseResult = testSSE(lhs,n); } for (int k = 0; k < nofTESTCases; k++) { avxResult = testAVX(lhs,n); } time = timestamp(); for (int k = 0; k < nofTESTCases; k++) { normalResult = testNormal(lhs,n); } normalTime = timestamp() - time; time = timestamp(); for (int 
k = 0; k < nofTESTCases; k++) { sseResult = testSSE(lhs,n); } sseTime = timestamp() - time; time = timestamp(); for (int k = 0; k < nofTESTCases; k++) { avxResult = testAVX(lhs,n); } avxTime = timestamp() - time; cout << "===========================" << endl; cout << "Normal took " << normalTime << " s" << endl; cout << "Normal Result: " << normalResult << endl; cout << "SSE took " << sseTime << " s" << endl; cout << "SSE Result: " << sseResult << endl; cout << "AVX took " << avxTime << " s" << endl; cout << "AVX Result: " << avxResult << endl; cout << "SpeedUP SSE= " << normalTime / sseTime << endl; cout << "SpeedUP AVX= " << normalTime / avxTime << endl; cout << "===========================" << endl; } int main() { alignTestSSE(); return 0; }
测试:
$clang++ -Wall -mavx2 -O3 -fno-vectorize SO_avx.cpp && ./a.out =========================== Normal took 0.00324106 s Normal Result: 2143749391 SSE took 0.000527859 s SSE Result: 2143749391 AVX took 0.000221968 s AVX Result: 2143749391 SpeedUP SSE= 6.14002 SpeedUP AVX= 14.6015 ===========================
我建议你尝试上面的代码,使用 -fno-vectorize(如果使用 g++,则为 -fno-tree-vectorize),看看是否得到类似的结果。如果是这样,你可以再回头对照原始代码,找出可能存在差异的地方。
以上是大佬教程为你收集整理的c – AVX2比Haswell上的SSE慢全部内容,希望文章能够帮你解决c – AVX2比Haswell上的SSE慢所遇到的程序开发问题。
如果觉得大佬教程网站内容还不错,欢迎将大佬教程推荐给程序员好友。
本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。