g++ 编译sse出错
时间:2010-06-27
来源:互联网
代码如下:
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <xmmintrin.h>
using namespace std;
typedef union{
__m128 m;
float f[4];
} sseTemp;
float dotWithSSE(float *a, float *b, const int len){
int step = len/4;
__m128* one = (__m128*)a;
__m128* two = (__m128*)b;
if(NULL == one || NULL == two){
printf("%d\n", __LINE__);
exit(1);
}
__m128 result = _mm_setzero_ps();
__m128 temp = _mm_set_ps(0.1f, 0.1f, 0.1f, 0.1f);
for(int i = 0; i < step -1; i++){
temp = _mm_add_ps(*one, *two);
temp = _mm_mul_ps(temp, temp);
result = _mm_add_ps(result, temp);
}
sseTemp s;
s.m = result;
float r = 0.0f;
r += s.f[0] + s.f[1] + s.f[2] + s.f[3];
if(0 != (len & 3)){
for(int i = 4*step; i < len; i++){
r += (a[i] - b[i])*(a[i] - b[i]);
}
}
return r;
}
float dotWithoutSSE(float* __restrict__ a, float* __restrict__ b, const int len){
float result = 0.0f, temp;
for(int i = 0; i < len; i++){
temp = (a[i] - b[i]);
result += temp*temp;
}
return result;
}
int runTest(const int len){
float *a, *b;
a = (float*) malloc(sizeof(float)*len);
b = (float*) malloc(sizeof(float)*len);
if(NULL == a || NULL == b){
printf("malloc fail\n");
exit(1);
}
for(int i = 0; i < len; i++){
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = 1.0f*rand()/RAND_MAX;
}
clock_t start = clock();
float sse = dotWithSSE(a, b, len);
clock_t end = clock();
clock_t timeSSE = end - start;
start = clock();
float withoutSSE = dotWithoutSSE(a, b, len);
end = clock();
clock_t timeWithoutSSE = end - start;
if(fabs(sse - withoutSSE) < 0.00001){
printf("test pass\n");
}
printf("withSSE = %f, wihtoutSSE = %f", sse, withoutSSE);
free(a);
free(b);
}
int main(){
runTest(1000);
return 0;
}
使用gcc测试时,代码 temp = _mm_add_ps(*one, *two);越界,这另我百思不得其解,先谢谢了!
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <xmmintrin.h>
using namespace std;
typedef union{
__m128 m;
float f[4];
} sseTemp;
float dotWithSSE(float *a, float *b, const int len){
int step = len/4;
__m128* one = (__m128*)a;
__m128* two = (__m128*)b;
if(NULL == one || NULL == two){
printf("%d\n", __LINE__);
exit(1);
}
__m128 result = _mm_setzero_ps();
__m128 temp = _mm_set_ps(0.1f, 0.1f, 0.1f, 0.1f);
for(int i = 0; i < step -1; i++){
temp = _mm_add_ps(*one, *two);
temp = _mm_mul_ps(temp, temp);
result = _mm_add_ps(result, temp);
}
sseTemp s;
s.m = result;
float r = 0.0f;
r += s.f[0] + s.f[1] + s.f[2] + s.f[3];
if(0 != (len & 3)){
for(int i = 4*step; i < len; i++){
r += (a[i] - b[i])*(a[i] - b[i]);
}
}
return r;
}
float dotWithoutSSE(float* __restrict__ a, float* __restrict__ b, const int len){
float result = 0.0f, temp;
for(int i = 0; i < len; i++){
temp = (a[i] - b[i]);
result += temp*temp;
}
return result;
}
int runTest(const int len){
float *a, *b;
a = (float*) malloc(sizeof(float)*len);
b = (float*) malloc(sizeof(float)*len);
if(NULL == a || NULL == b){
printf("malloc fail\n");
exit(1);
}
for(int i = 0; i < len; i++){
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = 1.0f*rand()/RAND_MAX;
}
clock_t start = clock();
float sse = dotWithSSE(a, b, len);
clock_t end = clock();
clock_t timeSSE = end - start;
start = clock();
float withoutSSE = dotWithoutSSE(a, b, len);
end = clock();
clock_t timeWithoutSSE = end - start;
if(fabs(sse - withoutSSE) < 0.00001){
printf("test pass\n");
}
printf("withSSE = %f, wihtoutSSE = %f", sse, withoutSSE);
free(a);
free(b);
}
int main(){
runTest(1000);
return 0;
}
使用gcc测试时,代码 temp = _mm_add_ps(*one, *two);越界,这另我百思不得其解,先谢谢了!
作者: yyfn风辰 发布时间: 2010-06-27
好像是地址没对齐,SSE 要求 16-Byte 对齐的。
作者: 变异老鼠 发布时间: 2010-06-27
回复 变异老鼠
谢谢你!
那在gcc中怎么将malloc分配的空间对齐到16字节啊?我试过__attribute__(aligned(16))不行啊!
谢谢你!
那在gcc中怎么将malloc分配的空间对齐到16字节啊?我试过__attribute__(aligned(16))不行啊!
作者: yyfn风辰 发布时间: 2010-06-27
相关阅读 更多
热门阅读
-
office 2019专业增强版最新2021版激活秘钥/序列号/激活码推荐 附激活工具
阅读:74
-
如何安装mysql8.0
阅读:31
-
Word快速设置标题样式步骤详解
阅读:28
-
20+道必知必会的Vue面试题(附答案解析)
阅读:37
-
HTML如何制作表单
阅读:22
-
百词斩可以改天数吗?当然可以,4个步骤轻松修改天数!
阅读:31
-
ET文件格式和XLS格式文件之间如何转化?
阅读:24
-
react和vue的区别及优缺点是什么
阅读:121
-
支付宝人脸识别如何关闭?
阅读:21
-
腾讯微云怎么修改照片或视频备份路径?
阅读:28