本文介绍如何使用 opt 工具优化 LLVM IR。
使用之前编写的代码 multiply.c
/* Multiplies two local integer constants (5 and 3) and returns the product. */
int mult() {
int a = 5;
int b = 3;
int c = a * b;
return c;
}
执行命令
clang -emit-llvm -S multiply.c -o multiply.ll
生成 multiply.ll
; ModuleID = 'multiply.c'
source_filename = "multiply.c"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @mult() #0 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
store i32 5, i32* %a, align 4
store i32 3, i32* %b, align 4
%0 = load i32, i32* %a, align 4
%1 = load i32, i32* %b, align 4
%mul = mul nsw i32 %0, %1
store i32 %mul, i32* %c, align 4
%2 = load i32, i32* %c, align 4
ret i32 %2
}
attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 7.0.0 (trunk 324834)"}
执行命令:
opt -mem2reg -S multiply.ll -o multiply1.ll
输出 multiply1.ll
; ModuleID = 'multiply.ll'
source_filename = "multiply.c"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @mult() #0 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
store i32 5, i32* %a, align 4
store i32 3, i32* %b, align 4
%0 = load i32, i32* %a, align 4
%1 = load i32, i32* %b, align 4
%mul = mul nsw i32 %0, %1
store i32 %mul, i32* %c, align 4
%2 = load i32, i32* %c, align 4
ret i32 %2
}
attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 7.0.0 (trunk 324834)"}
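我在 mac 上执行的,结果和书上的不一致,这个优化并没有生效。原因是:较新版本的 clang 在默认的 -O0 下会给每个函数加上 optnone 属性(可以在上面 IR 的 "Function Attrs" 和 attributes #0 中看到),而 opt 会跳过带有 optnone 属性的函数,所以 mem2reg 没有对 IR 做任何修改。生成 IR 时加上 -Xclang -disable-O0-optnone 就可以去掉这个属性,例如:clang -Xclang -disable-O0-optnone -emit-llvm -S multiply.c -o multiply.ll,然后再执行 opt -mem2reg,函数中的 alloca/store/load 就会被提升为寄存器值。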
其他的优化选项还有
-
adce
: Aggressive Dead Code Elimination -
bb-vectorize
: Basic-Block Vectorization -
constprop
: Simple constant propagation -
dce
: Dead Code Elimination -
deadargelim
: Dead Argument Elimination -
globaldce
: Dead Global Elimination -
globalopt
: Global Variable Optimizer -
gvn
: Global Value Numbering -
inline
: Function Integration/Inlining -
instcombine
: Combine redundant instructions -
licm
: Loop Invariant Code Motion -
loop-unswitch
: Unswitch Loops -
loweratomic
: Lower atomic intrinsics to non-atomic form -
lowerinvoke
: Lower invokes to calls, for unwindless code generators -
lowerswitch
: Lower SwitchInsts to branches -
mem2reg
: Promote Memory to Register -
memcpyopt
: MemCpy Optimization -
simplifycfg
: Simplify the CFG -
sink
: Code sinking -
tailcallelim
: Tail Call Elimination
可以在源码目录 test/Transforms/
下找到测试代码。