删除文件中含特定字符串的行:
# 删除含"MIR"的行,直接修改原文件(不保留备份)
# sed -i '/MIR/d' test.txt
# 删除含"MIR"的行,但不改变文件本身,操作之后的结果可在终端显示或重定向
sed -e '/MIR/d' test.txt > test.refined
# 删除含字符串"MIR"或"LNC"的行,将结果保存到 test.refined
sed '/MIR/d;/LNC/d' a.txt > test.refined
保留表格中某列是特定值的行:
如下所示,我只想保留转录本的注释信息,即第三列为 transcript 的行
zcat hg19.ensGene.gtf.gz | head
chr1 ensGene transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "ENSG00000223972";
chr1 ensGene exon 11869 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; exon_id "ENST00000456328.1"; gene_name "ENSG00000223972";
chr1 ensGene exon 12613 12721 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "2"; exon_id "ENST00000456328.2"; gene_name "ENSG00000223972";
chr1 ensGene exon 13221 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "3"; exon_id "ENST00000456328.3"; gene_name "ENSG00000223972";
chr1 ensGene transcript 11872 14412 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; gene_name "ENSG00000223972";
chr1 ensGene exon 11872 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "1"; exon_id "ENST00000515242.1"; gene_name "ENSG00000223972";
chr1 ensGene exon 12613 12721 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "2"; exon_id "ENST00000515242.2"; gene_name "ENSG00000223972";
chr1 ensGene exon 13225 14412 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "3"; exon_id "ENST00000515242.3"; gene_name "ENSG00000223972";
chr1 ensGene transcript 11874 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000518655"; gene_name "ENSG00000223972";
chr1 ensGene exon 11874 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000518655"; exon_number "1"; exon_id "ENST00000518655.1"; gene_name "ENSG00000223972";
awk '{if($3~/^transcript$/)print}' hg19.refGene.gtf > genesAnno.gtf
cat genesAnno.gtf | head
chr1 refGene transcript 11869 14362 . + . gene_id "LOC102725121"; transcript_id "NR_148357"; gene_name "LOC102725121";
chr1 refGene transcript 11874 14409 . + . gene_id "DDX11L1"; transcript_id "NR_046018"; gene_name "DDX11L1";
chr22 refGene transcript 24666799 24813706 . + . gene_id "SPECC1L"; transcript_id "NM_015330"; gene_name "SPECC1L";
chr1 refGene transcript 17369 17436 . - . gene_id "MIR6859-1"; transcript_id "NR_106918"; gene_name "MIR6859-1";
chr1 refGene transcript 17369 17436 . - . gene_id "MIR6859-2"; transcript_id "NR_107062"; gene_name "MIR6859-2";
chr1 refGene transcript 17369 17436 . - . gene_id "MIR6859-3"; transcript_id "NR_107063"; gene_name "MIR6859-3";
chr1 refGene transcript 17369 17436 . - . gene_id "MIR6859-4"; transcript_id "NR_128720"; gene_name "MIR6859-4";
chr1 refGene transcript 30366 30503 . + . gene_id "MIR1302-2"; transcript_id "NR_036051"; gene_name "MIR1302-2";
chr1 refGene transcript 30366 30503 . + . gene_id "MIR1302-9"; transcript_id "NR_036266"; gene_name "MIR1302-9";
chr1 refGene transcript 30366 30503 . + . gene_id "MIR1302-10"; transcript_id "NR_036267"; gene_name "MIR1302-10";