视频链接
无意间在B站看到一个up做的awk入门教程,思路清晰,没有废话,所以花了一个多小时学习了下,分了3集:
- 基础
- 内部变量
- 正则表达式
老年人健忘..还是做个笔记吧,下面是视频链接:
awk入门教程-upload by 正月点灯笼
基础
- NR number of records (当前记录号,即行号)
- NF number of fields (当前行的字段数)
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR "\t" $1 "\t" $2 "\t" $3}' tmp1.xls
1 transcript_id Bt_F_FPKM J_F_FPKM
2 Gh_A01G0001 0.424353 0.580204666667
3 Gh_A01G0002 1.372276 1.12152666667
4 Gh_A01G0003 2.755143 2.54099033333
5 Gh_A01G0004 30.8250546667 29.1618696667
6 Gh_A01G0005 8.28325366667 10.730383
7 Gh_A01G0006 13.7739286667 11.6380556667
8 Gh_A01G0007 3.910698 4.715743
9 Gh_A01G0008 0.581012 1.11709433333
10 Gh_A01G0009 6.04281033333 7.47224133333
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR, $0}' tmp1.xls
1 transcript_id Bt_F_FPKM J_F_FPKM BtJ_F_FPKM
2 Gh_A01G0001 0.424353 0.580204666667 0.48476
3 Gh_A01G0002 1.372276 1.12152666667 1.22913966667
4 Gh_A01G0003 2.755143 2.54099033333 2.96335666667
5 Gh_A01G0004 30.8250546667 29.1618696667 29.7063626667
6 Gh_A01G0005 8.28325366667 10.730383 10.7914463333
7 Gh_A01G0006 13.7739286667 11.6380556667 13.528866
8 Gh_A01G0007 3.910698 4.715743 4.198037
9 Gh_A01G0008 0.581012 1.11709433333 0.942497666667
10 Gh_A01G0009 6.04281033333 7.47224133333 6.05517166667
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NF "\t" $0}' tmp1.xls
4 transcript_id Bt_F_FPKM J_F_FPKM BtJ_F_FPKM
4 Gh_A01G0001 0.424353 0.580204666667 0.48476
4 Gh_A01G0002 1.372276 1.12152666667 1.22913966667
4 Gh_A01G0003 2.755143 2.54099033333 2.96335666667
4 Gh_A01G0004 30.8250546667 29.1618696667 29.7063626667
4 Gh_A01G0005 8.28325366667 10.730383 10.7914463333
4 Gh_A01G0006 13.7739286667 11.6380556667 13.528866
4 Gh_A01G0007 3.910698 4.715743 4.198037
4 Gh_A01G0008 0.581012 1.11709433333 0.942497666667
4 Gh_A01G0009 6.04281033333 7.47224133333 6.05517166667
shawnwx@DrdeMacBook-Pro Kmeans$ awk '$1 == "Gh_A01G0007"{print}' tmp1.xls
Gh_A01G0007 3.910698 4.715743 4.198037
# 说明Gh_A01G0007是一个字符串而非变量
内部变量
NR NF FS
# 限定NR == 7打印第7行
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'NR == 7{print}' tmp1.xls
Gh_A01G0006 13.7739286667 11.6380556667 13.528866
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'NF == 4{print}' tmp1.xls
# 限定NF == 4打印field数为4的行
transcript_id Bt_F_FPKM J_F_FPKM BtJ_F_FPKM
Gh_A01G0001 0.424353 0.580204666667 0.48476
Gh_A01G0002 1.372276 1.12152666667 1.22913966667
Gh_A01G0003 2.755143 2.54099033333 2.96335666667
Gh_A01G0004 30.8250546667 29.1618696667 29.7063626667
Gh_A01G0005 8.28325366667 10.730383 10.7914463333
Gh_A01G0006 13.7739286667 11.6380556667 13.528866
Gh_A01G0007 3.910698 4.715743 4.198037
Gh_A01G0008 0.581012 1.11709433333 0.942497666667
Gh_A01G0009 6.04281033333 7.47224133333 6.05517166667
# 如果不加文件名的话,awk会从标准输入读取数据,下面输入什么它就会按照awk命令的指示输出
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $1, $2}'
hello world
hello world
## awk默认的分隔符为空白字符(空格或制表符)
hello, world
hello, world
hello,world 123 456
hello,world 123
# ===============定义全局变量================
## 用BEGIN开始,在BEGIN{}里面用FS定义分隔符为,
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{FS=","} {print $1, $2}'
hello world 123 456
hello world 123 456
## 由于input是用空格分隔的,但是刚才定义的分隔符是逗号,所以这里把整行都看成一列
hello,world,123,456
hello world
## awk的输入分隔符和输出分隔符不同,虽然这里修改了默认的输入分隔符为,但是输出分割符仍旧是空格。
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{OFS=","} {print $1, $2}'
hello world 123 456
hello,world
awk 'BEGIN{FS=","; OFS=","} {print $1, $2}'
hello,world,123,456
hello,world
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{FS=","; OFS="\t"} {print $1, $2}'
hello,world,123,456
hello world
# ===============FILENAME==================
## 如果一个awk后面接两个file,会自动把file2接到file1下面,无法区分;这时候加上FILENAME就会显示每一行来自哪个文件,从而看出从第几行开始是file2
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR, FILENAME, $0}' tmp1.xls tmp2.xls
1 tmp1.xls transcript_id Bt_F_FPKM J_F_FPKM BtJ_F_FPKM
2 tmp1.xls Gh_A01G0001 0.424353 0.580204666667 0.48476
3 tmp1.xls Gh_A01G0002 1.372276 1.12152666667 1.22913966667
4 tmp1.xls Gh_A01G0003 2.755143 2.54099033333 2.96335666667
5 tmp1.xls Gh_A01G0004 30.8250546667 29.1618696667 29.7063626667
6 tmp1.xls Gh_A01G0005 8.28325366667 10.730383 10.7914463333
7 tmp1.xls Gh_A01G0006 13.7739286667 11.6380556667 13.528866
8 tmp1.xls Gh_A01G0007 3.910698 4.715743 4.198037
9 tmp1.xls Gh_A01G0008 0.581012 1.11709433333 0.942497666667
10 tmp1.xls Gh_A01G0009 6.04281033333 7.47224133333 6.05517166667
11 tmp2.xls Gh_Sca277334G01 0 0.0122663333333 0
12 tmp2.xls Gh_Sca278127G01 0 0 0
13 tmp2.xls Gh_Sca278164G01 0 0 0
14 tmp2.xls Gh_Sca280882G01 0 0 0
15 tmp2.xls Gh_Sca283304G01 0.110361333333 0.050783 0.0192516666667
16 tmp2.xls Gh_Sca284875G01 0 0 0
17 tmp2.xls Gh_Sca286293G01 0.445272333333 0.442850333333 0.595937
18 tmp2.xls Gh_Sca286786G01 0 0 0
19 tmp2.xls Gh_Sca287394G01 0 0 0
20 tmp2.xls Gh_Sca288207G01 0 0 0
# ================隐藏某列==============
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{$2="xxx";print $0}' tmp1.xls
transcript_id xxx J_F_FPKM BtJ_F_FPKM
Gh_A01G0001 xxx 0.580204666667 0.48476
Gh_A01G0002 xxx 1.12152666667 1.22913966667
Gh_A01G0003 xxx 2.54099033333 2.96335666667
Gh_A01G0004 xxx 29.1618696667 29.7063626667
Gh_A01G0005 xxx 10.730383 10.7914463333
Gh_A01G0006 xxx 11.6380556667 13.528866
Gh_A01G0007 xxx 4.715743 4.198037
Gh_A01G0008 xxx 1.11709433333 0.942497666667
Gh_A01G0009 xxx 7.47224133333 6.05517166667
# ===============打印文本最后一列=========
# 有些情况下某些列里的字符有空格,空格又是awk默认的分隔符,所以在特殊情况下要打印最后一列并不能用print $具体第几列,或者遇见列数不一致的文本也是这样,所以用print $NF
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $NF}' tmp1.xls
BtJ_F_FPKM
0.48476
1.22913966667
2.96335666667
29.7063626667
10.7914463333
13.528866
4.198037
0.942497666667
6.05517166667
# 同理,打印倒二列
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $(NF-1)}' tmp1.xls
J_F_FPKM
0.580204666667
1.12152666667
2.54099033333
29.1618696667
10.730383
11.6380556667
4.715743
1.11709433333
7.47224133333
# =============自定义变量=================================
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a + b}'
4
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a b}'
13
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a - b}'
-2
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a * b}'
3
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a / b}'
0.333333
# 取余
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a % b}'
1
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=2; c=3; print a b+3}'
15
第三讲 Regular Expression 正则表达式
- 书写正则表达式用//
- /abc/ 匹配: "abc" "xxxabc" "xxabc"
  不匹配: "a bc"
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/abc/{print $0}' tmp3.txt
abc
xxabc
xxabcxx
- /a.c/ 表示一个字母a,中间是任意一个字符,然后是字母c
  - 匹配: "abc" "aac" "acc" "adc" "a c" "a.c"
  - 不匹配: "abbc"
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a.c/{print $0}' tmp3.txt
abc
xxabc
xxabcxx
# .只能代表一个字符位,如果多于一个字符也匹配不到
shawnwx@DrdeMacBook-Pro Kmeans$ cat tmp3.txt
abc
xxabc
xxabcxx
a bc
a b c
ab c
- /a\.c/ 反斜杠\表示转义字符,这样表示精确查找a.c(点号不再代表任意字符)
shawnwx@DrdeMacBook-Pro Kmeans$ vim tmp3.txt
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a\.c/{print $0}' tmp3.txt
a.c
# 如果提取时遇到例如/\?.等awk认定的特殊字符都需要在前面加上\
/a\/c/
/a/c/
/a\\c/
/a\c/
/a\?c/
/a?c/
- ^ 和 $
- /^abc/表示abc一定要出现在字符串的最前面 匹配: "abc" "abcxx" "abcxxxx" "abc...." 不匹配: "aabc" "dabc" "xxabc"
- /abc$/表示abc一定出现在匹配字符串的结尾 匹配: "abc" "aabc" "dabc" "xxabc" 不匹配: "abcxx" "abcxxxx" "abc...."
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/^abc/{print $0}' tmp3.txt
abc
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/abc$/{print $0}' tmp3.txt
abc
xxabc
- []
- /a[bdf]c/ 表示a和c之间只能是方括号中b、d、f其中的一个字母 匹配: "abc" "adc" "afc" 不匹配: "aec" "abbc" "a.c"
- /a[a-z]c/ 表示a和c之间为a-z(小写)中的一个字母 匹配: "abc" "adc" "afc" "aec" 不匹配: "abbc" "a.c" "Aac" "aBc"
- /a[a-zA-Z]c/ 这样中间可以是任意一个大写或小写字母 匹配: "abc" "adc" "afc" "aec" "aBc" 不匹配: "abbc" "a.c" "Aac"
- /a[^a-z]c/ ^如果在[]之内表示取反,这个正则表达式的意思是a和c之间的那一个字符不是小写字母 匹配: "aBc" 不匹配: "abc" "adc" "afc" "aec" "abbc"
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a[bdf]c/{print $0}' tmp3.txt
abc
xxabc
xxabcxx
- *和+
- /a*b/ 这里的*不是通配符的意思,也不是乘号,而是指不限制a的个数,a可以出现0到无限多次 匹配: "b" "ab" "aaab" "aaaaaab"
- /a+b/ 这里的+不是表示加号,而表示a至少出现一次 匹配: "ab" "aab" "aaaab"
  不匹配: "b"
- ?
- /a?b/ ?表示a可有可无 匹配: "ab" "b"
  不匹配: "a"
- {}
- /ab{3}c/ 表示b必须出现3次: "abbbc"; /ab{3,4}c/ 表示b可以出现3次也可以出现4次: "abbbc" "abbbbc"; /ab{3,}c/ 表示a后面至少有3个b然后是一个c: "adfasdfabbbcxxxxx" "abbbc"
- ()
- /(ab)+c/ 之前/ab+c/表示b要反复出现,把ab括起来以后表示ab整体要反复出现 "abababababc"