strcpy优化问题
时间:2011-09-25
来源:互联网
我捣鼓了半天汇编,优化的还没c优化的版本快,郁闷。。。。感觉lodsl 比 movl + add 还慢。。。哎。。。
看反汇编后的字节码,我用汇编优化的版本明明更加精简,可就是更慢,不知道为什么……
/*
 * Copy the NUL-terminated string s2, terminator included, into s1.
 *
 * The copy loop is unrolled four bytes per iteration; each byte is stored
 * before it is tested, so the terminator is always written before the loop
 * exits. When both pointers are equal nothing is copied.
 *
 * Returns the original value of s1.
 */
char* strcpy(char* s1, char const* s2)
{
    char*       dst = s1;
    char const* src = s2;

    if (dst != src)
    {
        /* advance both cursors by one unrolled group per pass */
        for (;; dst += 4, src += 4)
        {
            if ((dst[0] = src[0]) == '\0') break;
            if ((dst[1] = src[1]) == '\0') break;
            if ((dst[2] = src[2]) == '\0') break;
            if ((dst[3] = src[3]) == '\0') break;
        }
    }

    return s1;
}
/*
 * Copy the NUL-terminated string s2, terminator included, into s1 using
 * 32-bit x86 inline assembly, and return s1.
 *
 * The source is first copied byte by byte until its address is 4-byte
 * aligned. After that it is read one aligned 32-bit word at a time: a word
 * containing no zero byte is stored whole with stosl, and the word holding
 * the terminator is stored byte by byte up to and including the NUL.
 *
 * Why the alignment prologue: the word load reads up to three bytes beyond
 * the terminator. An aligned 4-byte load cannot straddle a page boundary, so
 * those extra bytes are always on the same page as the terminator and the
 * load cannot fault. Without the prologue an unaligned load could touch the
 * following (possibly unmapped) page.
 *
 * Relies on the direction flag being clear (stosb/stosl must increment %edi)
 * and on 32-bit addressing through %esi/%edi.
 */
char* strcpy(char* s1, char const* s2)
{
    /* dummy outputs: tell the compiler that %esi/%edi are modified */
    tb_size_t edi, esi;
    __tb_asm__ __tb_volatile__
    (
        /* prologue: copy single bytes until the source is 4-byte aligned */
        "0:\n"
        "   testl $3, %%esi\n"
        "   je 1f\n"
        "   movb (%%esi), %%al\n"
        "   add $1, %%esi\n"
        "   stosb\n"
        "   testb %%al, %%al\n"
        "   jne 0b\n"
        "   jmp 3f\n"           /* terminator already copied */

        /* main loop: one aligned word per iteration */
        "1:\n"
        "   movl (%%esi), %%eax\n"  /* lodsl was measured slower here */
        "   add $4, %%esi\n"
        "   movl %%eax, %%edx\n"    /* scan a copy so %eax stays intact */
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   stosl\n"                /* no zero byte: store the whole word */
        "   jmp 1b\n"

        /* tail: store the last word byte by byte, stopping after the NUL */
        "2:\n"
        "   stosb\n"
        "   testb %%al, %%al\n"
        "   je 3f\n"
        "   shr $8, %%eax\n"
        "   jmp 2b\n"
        "3:\n"

        : "=&S" (esi), "=&D" (edi)
        : "0" (s2), "1" (s1)
        : "memory", "cc", "eax", "edx"
    );
    return s1;
}
////////////////////////////////////////////////////////////
两者的反汇编和字节码:
C:
00000000 <tb_strcpy>:
0: 53 push %ebx
1: 8b 4c 24 0c mov 0xc(%esp),%ecx
5: 8b 44 24 08 mov 0x8(%esp),%eax
9: 85 c9 test %ecx,%ecx
b: 74 7b je 88 <tb_strcpy+0x88>
d: 85 c0 test %eax,%eax
f: 74 77 je 88 <tb_strcpy+0x88>
11: 39 c8 cmp %ecx,%eax
13: 74 6a je 7f <tb_strcpy+0x7f>
15: 0f b6 11 movzbl (%ecx),%edx
18: 88 10 mov %dl,(%eax)
1a: 84 d2 test %dl,%dl
1c: 74 61 je 7f <tb_strcpy+0x7f>
1e: 0f b6 51 01 movzbl 0x1(%ecx),%edx
22: 88 50 01 mov %dl,0x1(%eax)
25: 84 d2 test %dl,%dl
27: 74 56 je 7f <tb_strcpy+0x7f>
29: 0f b6 51 02 movzbl 0x2(%ecx),%edx
2d: 88 50 02 mov %dl,0x2(%eax)
30: 84 d2 test %dl,%dl
32: 74 4b je 7f <tb_strcpy+0x7f>
34: 0f b6 59 03 movzbl 0x3(%ecx),%ebx
38: 31 d2 xor %edx,%edx
3a: 88 58 03 mov %bl,0x3(%eax)
3d: 84 db test %bl,%bl
3f: 75 31 jne 72 <tb_strcpy+0x72>
41: eb 3c jmp 7f <tb_strcpy+0x7f>
43: 90 nop
44: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
48: 0f b6 5c 11 05 movzbl 0x5(%ecx,%edx,1),%ebx
4d: 88 5c 10 05 mov %bl,0x5(%eax,%edx,1)
51: 84 db test %bl,%bl
53: 74 2a je 7f <tb_strcpy+0x7f>
55: 0f b6 5c 11 06 movzbl 0x6(%ecx,%edx,1),%ebx
5a: 88 5c 10 06 mov %bl,0x6(%eax,%edx,1)
5e: 84 db test %bl,%bl
60: 74 1d je 7f <tb_strcpy+0x7f>
62: 0f b6 5c 11 07 movzbl 0x7(%ecx,%edx,1),%ebx
67: 88 5c 10 07 mov %bl,0x7(%eax,%edx,1)
6b: 83 c2 04 add $0x4,%edx
6e: 84 db test %bl,%bl
70: 74 0d je 7f <tb_strcpy+0x7f>
72: 0f b6 5c 11 04 movzbl 0x4(%ecx,%edx,1),%ebx
77: 88 5c 10 04 mov %bl,0x4(%eax,%edx,1)
7b: 84 db test %bl,%bl
7d: 75 c9 jne 48 <tb_strcpy+0x48>
7f: 5b pop %ebx
80: c3 ret
81: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi
88: 31 c0 xor %eax,%eax
8a: 5b pop %ebx
8b: c3 ret
asm:
00000000 <tb_strcpy>:
0: 83 ec 08 sub $0x8,%esp
3: 89 34 24 mov %esi,(%esp)
6: 89 7c 24 04 mov %edi,0x4(%esp)
a: 8b 74 24 10 mov 0x10(%esp),%esi
e: 8b 4c 24 0c mov 0xc(%esp),%ecx
12: 85 f6 test %esi,%esi
14: 74 4a je 60 <tb_strcpy+0x60>
16: 85 c9 test %ecx,%ecx
18: 74 46 je 60 <tb_strcpy+0x60>
1a: 89 cf mov %ecx,%edi
1c: 8b 06 mov (%esi),%eax
1e: 83 c6 04 add $0x4,%esi
21: 89 c2 mov %eax,%edx
23: 84 d2 test %dl,%dl
25: 74 18 je 3f <tb_strcpy+0x3f>
27: c1 ea 08 shr $0x8,%edx
2a: 84 d2 test %dl,%dl
2c: 74 11 je 3f <tb_strcpy+0x3f>
2e: c1 ea 08 shr $0x8,%edx
31: 84 d2 test %dl,%dl
33: 74 0a je 3f <tb_strcpy+0x3f>
35: c1 ea 08 shr $0x8,%edx
38: 84 d2 test %dl,%dl
3a: 74 03 je 3f <tb_strcpy+0x3f>
3c: ab stos %eax,%es:(%edi)
3d: eb dd jmp 1c <tb_strcpy+0x1c>
3f: aa stos %al,%es:(%edi)
40: 84 c0 test %al,%al
42: 74 05 je 49 <tb_strcpy+0x49>
44: c1 e8 08 shr $0x8,%eax
47: eb f6 jmp 3f <tb_strcpy+0x3f>
49: 89 c8 mov %ecx,%eax
4b: 8b 34 24 mov (%esp),%esi
4e: 8b 7c 24 04 mov 0x4(%esp),%edi
52: 83 c4 08 add $0x8,%esp
55: c3 ret
56: 8d 76 00 lea 0x0(%esi),%esi
59: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
60: 31 c9 xor %ecx,%ecx
62: 8b 34 24 mov (%esp),%esi
65: 89 c8 mov %ecx,%eax
67: 8b 7c 24 04 mov 0x4(%esp),%edi
6b: 83 c4 08 add $0x8,%esp
6e: c3 ret
看反汇编后的字节码,我用汇编优化的版本明明更加精简,可就是更慢,不知道为什么……
/*
 * Copy the NUL-terminated string s2, terminator included, into s1.
 *
 * The copy loop is unrolled four bytes per iteration; each byte is stored
 * before it is tested, so the terminator is always written before the loop
 * exits. When both pointers are equal nothing is copied.
 *
 * Returns the original value of s1.
 */
char* strcpy(char* s1, char const* s2)
{
    char*       dst = s1;
    char const* src = s2;

    if (dst != src)
    {
        /* advance both cursors by one unrolled group per pass */
        for (;; dst += 4, src += 4)
        {
            if ((dst[0] = src[0]) == '\0') break;
            if ((dst[1] = src[1]) == '\0') break;
            if ((dst[2] = src[2]) == '\0') break;
            if ((dst[3] = src[3]) == '\0') break;
        }
    }

    return s1;
}
/*
 * Copy the NUL-terminated string s2, terminator included, into s1 using
 * 32-bit x86 inline assembly, and return s1.
 *
 * The source is first copied byte by byte until its address is 4-byte
 * aligned. After that it is read one aligned 32-bit word at a time: a word
 * containing no zero byte is stored whole with stosl, and the word holding
 * the terminator is stored byte by byte up to and including the NUL.
 *
 * Why the alignment prologue: the word load reads up to three bytes beyond
 * the terminator. An aligned 4-byte load cannot straddle a page boundary, so
 * those extra bytes are always on the same page as the terminator and the
 * load cannot fault. Without the prologue an unaligned load could touch the
 * following (possibly unmapped) page.
 *
 * Relies on the direction flag being clear (stosb/stosl must increment %edi)
 * and on 32-bit addressing through %esi/%edi.
 */
char* strcpy(char* s1, char const* s2)
{
    /* dummy outputs: tell the compiler that %esi/%edi are modified */
    tb_size_t edi, esi;
    __tb_asm__ __tb_volatile__
    (
        /* prologue: copy single bytes until the source is 4-byte aligned */
        "0:\n"
        "   testl $3, %%esi\n"
        "   je 1f\n"
        "   movb (%%esi), %%al\n"
        "   add $1, %%esi\n"
        "   stosb\n"
        "   testb %%al, %%al\n"
        "   jne 0b\n"
        "   jmp 3f\n"           /* terminator already copied */

        /* main loop: one aligned word per iteration */
        "1:\n"
        "   movl (%%esi), %%eax\n"  /* lodsl was measured slower here */
        "   add $4, %%esi\n"
        "   movl %%eax, %%edx\n"    /* scan a copy so %eax stays intact */
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   shr $8, %%edx\n"
        "   testb %%dl, %%dl\n"
        "   je 2f\n"
        "   stosl\n"                /* no zero byte: store the whole word */
        "   jmp 1b\n"

        /* tail: store the last word byte by byte, stopping after the NUL */
        "2:\n"
        "   stosb\n"
        "   testb %%al, %%al\n"
        "   je 3f\n"
        "   shr $8, %%eax\n"
        "   jmp 2b\n"
        "3:\n"

        : "=&S" (esi), "=&D" (edi)
        : "0" (s2), "1" (s1)
        : "memory", "cc", "eax", "edx"
    );
    return s1;
}
////////////////////////////////////////////////////////////
两者的反汇编和字节码:
C:
00000000 <tb_strcpy>:
0: 53 push %ebx
1: 8b 4c 24 0c mov 0xc(%esp),%ecx
5: 8b 44 24 08 mov 0x8(%esp),%eax
9: 85 c9 test %ecx,%ecx
b: 74 7b je 88 <tb_strcpy+0x88>
d: 85 c0 test %eax,%eax
f: 74 77 je 88 <tb_strcpy+0x88>
11: 39 c8 cmp %ecx,%eax
13: 74 6a je 7f <tb_strcpy+0x7f>
15: 0f b6 11 movzbl (%ecx),%edx
18: 88 10 mov %dl,(%eax)
1a: 84 d2 test %dl,%dl
1c: 74 61 je 7f <tb_strcpy+0x7f>
1e: 0f b6 51 01 movzbl 0x1(%ecx),%edx
22: 88 50 01 mov %dl,0x1(%eax)
25: 84 d2 test %dl,%dl
27: 74 56 je 7f <tb_strcpy+0x7f>
29: 0f b6 51 02 movzbl 0x2(%ecx),%edx
2d: 88 50 02 mov %dl,0x2(%eax)
30: 84 d2 test %dl,%dl
32: 74 4b je 7f <tb_strcpy+0x7f>
34: 0f b6 59 03 movzbl 0x3(%ecx),%ebx
38: 31 d2 xor %edx,%edx
3a: 88 58 03 mov %bl,0x3(%eax)
3d: 84 db test %bl,%bl
3f: 75 31 jne 72 <tb_strcpy+0x72>
41: eb 3c jmp 7f <tb_strcpy+0x7f>
43: 90 nop
44: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi
48: 0f b6 5c 11 05 movzbl 0x5(%ecx,%edx,1),%ebx
4d: 88 5c 10 05 mov %bl,0x5(%eax,%edx,1)
51: 84 db test %bl,%bl
53: 74 2a je 7f <tb_strcpy+0x7f>
55: 0f b6 5c 11 06 movzbl 0x6(%ecx,%edx,1),%ebx
5a: 88 5c 10 06 mov %bl,0x6(%eax,%edx,1)
5e: 84 db test %bl,%bl
60: 74 1d je 7f <tb_strcpy+0x7f>
62: 0f b6 5c 11 07 movzbl 0x7(%ecx,%edx,1),%ebx
67: 88 5c 10 07 mov %bl,0x7(%eax,%edx,1)
6b: 83 c2 04 add $0x4,%edx
6e: 84 db test %bl,%bl
70: 74 0d je 7f <tb_strcpy+0x7f>
72: 0f b6 5c 11 04 movzbl 0x4(%ecx,%edx,1),%ebx
77: 88 5c 10 04 mov %bl,0x4(%eax,%edx,1)
7b: 84 db test %bl,%bl
7d: 75 c9 jne 48 <tb_strcpy+0x48>
7f: 5b pop %ebx
80: c3 ret
81: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi
88: 31 c0 xor %eax,%eax
8a: 5b pop %ebx
8b: c3 ret
asm:
00000000 <tb_strcpy>:
0: 83 ec 08 sub $0x8,%esp
3: 89 34 24 mov %esi,(%esp)
6: 89 7c 24 04 mov %edi,0x4(%esp)
a: 8b 74 24 10 mov 0x10(%esp),%esi
e: 8b 4c 24 0c mov 0xc(%esp),%ecx
12: 85 f6 test %esi,%esi
14: 74 4a je 60 <tb_strcpy+0x60>
16: 85 c9 test %ecx,%ecx
18: 74 46 je 60 <tb_strcpy+0x60>
1a: 89 cf mov %ecx,%edi
1c: 8b 06 mov (%esi),%eax
1e: 83 c6 04 add $0x4,%esi
21: 89 c2 mov %eax,%edx
23: 84 d2 test %dl,%dl
25: 74 18 je 3f <tb_strcpy+0x3f>
27: c1 ea 08 shr $0x8,%edx
2a: 84 d2 test %dl,%dl
2c: 74 11 je 3f <tb_strcpy+0x3f>
2e: c1 ea 08 shr $0x8,%edx
31: 84 d2 test %dl,%dl
33: 74 0a je 3f <tb_strcpy+0x3f>
35: c1 ea 08 shr $0x8,%edx
38: 84 d2 test %dl,%dl
3a: 74 03 je 3f <tb_strcpy+0x3f>
3c: ab stos %eax,%es:(%edi)
3d: eb dd jmp 1c <tb_strcpy+0x1c>
3f: aa stos %al,%es:(%edi)
40: 84 c0 test %al,%al
42: 74 05 je 49 <tb_strcpy+0x49>
44: c1 e8 08 shr $0x8,%eax
47: eb f6 jmp 3f <tb_strcpy+0x3f>
49: 89 c8 mov %ecx,%eax
4b: 8b 34 24 mov (%esp),%esi
4e: 8b 7c 24 04 mov 0x4(%esp),%edi
52: 83 c4 08 add $0x8,%esp
55: c3 ret
56: 8d 76 00 lea 0x0(%esi),%esi
59: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
60: 31 c9 xor %ecx,%ecx
62: 8b 34 24 mov (%esp),%esi
65: 89 c8 mov %ecx,%eax
67: 8b 7c 24 04 mov 0x4(%esp),%edi
6b: 83 c4 08 add $0x8,%esp
6e: c3 ret
作者: waruqi 发布时间: 2011-09-25
现在的编译器的优化已经不错了
作者: MSOKD 发布时间: 2011-09-25
相关阅读 更多
热门阅读
-
office 2019专业增强版最新2021版激活秘钥/序列号/激活码推荐 附激活工具
阅读:74
-
如何安装mysql8.0
阅读:31
-
Word快速设置标题样式步骤详解
阅读:28
-
20+道必知必会的Vue面试题(附答案解析)
阅读:37
-
HTML如何制作表单
阅读:22
-
百词斩可以改天数吗?当然可以,4个步骤轻松修改天数!
阅读:31
-
ET文件格式和XLS格式文件之间如何转化?
阅读:24
-
react和vue的区别及优缺点是什么
阅读:121
-
支付宝人脸识别如何关闭?
阅读:21
-
腾讯微云怎么修改照片或视频备份路径?
阅读:28