# Batch-run fastp
# Write one fastp command per paired-end sample to ./cmds (the commands are
# only written here, not executed). A sample is every file matching
# ../raw_rna-seq/<sample>_1.fastq.gz; its mate is taken to be <sample>_2.fastq.gz.
# The glob is expanded by the shell instead of parsing `ls`, and names are
# emitted through %q so an unusual sample name cannot alter the command line.
for r1 in ../raw_rna-seq/*_1.fastq.gz; do
  # An unmatched glob stays literal; skip it so no bogus command is emitted.
  [ -e "$r1" ] || continue
  sample=${r1##*/}
  sample=${sample%_1.fastq.gz}
  # The leading space keeps each line identical to what was generated before.
  printf ' fastp -i %q -I %q -o %q -O %q -w 10 --length_required 90\n' \
    "../raw_rna-seq/${sample}_1.fastq.gz" \
    "../raw_rna-seq/${sample}_2.fastq.gz" \
    "${sample}_1.clean.fastq.gz" \
    "${sample}_2.clean.fastq.gz"
done > cmds
# Summarize genetic distances
# Print the smallest ("min:") and largest ("max:") value of column 1 across
# all *.data files in the current directory.
# Done in a single awk pass rather than sorting the whole data set; values are
# compared numerically but printed exactly as they appear in the input.
# Blank lines are ignored, and when no value is found nothing is printed on
# stdout (a message goes to stderr and awk exits non-zero).
awk '
  NF == 0 { next }
  !seen || ($1 + 0) < (lo + 0) { lo = $1 }
  !seen || ($1 + 0) > (hi + 0) { hi = $1 }
  { seen = 1 }
  END {
    if (!seen) {
      print "no values found in *.data" > "/dev/stderr"
      exit 1
    }
    print "min:", lo
    print "max:", hi
  }
' *.data
# Sum the length of column 3 over every record whose first field is "S"
# (presumably GFA segment lines, i.e. total sequence length — confirm).
# No file operand is given, so the data is read from standard input:
# pipe or redirect the graph into this command.
# `total + 0` makes the result print as 0, not an empty line, when no "S"
# record is seen.
awk '$1 == "S" { total += length($3) } END { print total + 0 }'
# Total of column 2 of the .fai index over every line containing "WhR_A"
# (column 2 is presumably the sequence length — confirm against the index).
# Filtering and summing happen in one awk; `total + 0` prints 0 instead of a
# blank line when no line matches.
awk '/WhR_A/ { total += $2 } END { print total + 0 }' merge.haplotype40.fasta.gz.fai
# Count the lines that start with "S" in each *gfa file (needs ripgrep).
rg --count '^S' *gfa
# Keep only the indel records of combined.vcf.gz, retaining every INFO field;
# output uses the prefix "combine.sv".
vcftools \
  --gzvcf combined.vcf.gz \
  --keep-only-indels \
  --recode \
  --recode-INFO-all \
  --out combine.sv

# Drop the indel records of merged_rename.vcf.gz, retaining every INFO field;
# output uses the prefix "snp".
vcftools \
  --gzvcf merged_rename.vcf.gz \
  --remove-indels \
  --recode \
  --recode-INFO-all \
  --out snp

# Run the helper script on sv.recode.vcf.
# NOTE(review): the indel step above writes with prefix "combine.sv", not "sv" —
# confirm that sv.recode.vcf is the intended input here.
python ~/split_vcf_by_length.py sv.recode.vcf
# Skip the first six lines of the histgrowth table and keep only the third
# tab-separated column of the rest.
tail -n +7 pggb.histgrowth.node.tsv \
  | cut -f 3 \
  > growth.list

# Feed that column to the curve script and store what it prints.
python ~/pangenome_curve.py growth.list > growth.list.stats
# For every line of the decompressed graph that begins with "P", print
# column 2 followed by the number of comma-separated items in column 3
# (presumably GFA path name and its step count — confirm).
zcat s20k.graphs.combined.gfa.gz \
  | awk -F'\t' '/^P/ { print $2, split($3, steps, ",") }' \
  > ~/workspace/pangenome/super_sugarcane_growth/stats.node
# For each *.stats file print one line: aligned / primary * 100 with six
# decimals, where "aligned" and "primary" are the third whitespace-separated
# field of the first line containing "Total aligned:" / "Total primary:".
# The percentage is computed as aligned * 100 / primary so the six decimals
# are all significant (dividing first and truncating loses two digits).
# If either count is missing or primary is zero, "NA" is printed so the output
# still has exactly one line per file, and a warning is sent to stderr.
for stats_file in *.stats; do
  # An unmatched glob stays literal; skip it.
  [ -e "$stats_file" ] || continue
  awk -v src="$stats_file" '
    /Total aligned:/ && aligned == "" { aligned = $3 }
    /Total primary:/ && primary == "" { primary = $3 }
    END {
      if (aligned != "" && primary + 0 > 0) {
        printf "%.6f\n", aligned * 100 / primary
      } else {
        print "NA"
        print "warning: cannot compute alignment rate for " src > "/dev/stderr"
      }
    }
  ' "$stats_file"
done
# For each *.bam.stats file, take the first line containing "mapped (" and
# print the first whitespace-delimited token found between its first "(" and
# the next parenthesis. One output line per file; the line is blank when the
# file has no such line.
for flagstat in *.bam.stats; do
  mapped_pct=$(
    awk -F'[()]' '/mapped \(/ { split($2, tok, " "); print tok[1]; exit }' "$flagstat"
  )
  printf '%s\n' "$mapped_pct"
done
# Print whatever follows "Total alignments:" on each matching line of every
# *stats file, one value per line.
# -h suppresses grep's "filename:" prefix, so the value is always field 2.
# Relying on that prefix (field 3) yields nothing when only one file matches
# the glob, and breaks for file names that contain ":".
grep -h "Total alignments" *stats | cut -d ":" -f 2
# Parenthesised percentage (number only, without "%") from each line of the
# job log that contains "concordantly exactly 1 time".
# NOTE(review): this command was labelled "Overall", yet it reads the
# "exactly 1 time" line; the overall rate is extracted by the next command.
grep -F 'concordantly exactly 1 time' index.sh.o222447 \
  | grep -oP '\(\K[0-9]+\.[0-9]+(?=%\))'

# Decimal number directly preceding "%" on every "overall alignment rate" line
# of the files matching *map.* (presumably aligner summary logs — confirm).
grep -F 'overall alignment rate' *map.* \
  | grep -oP '[0-9]+\.[0-9]+(?=%)'
# Build ./cmds (replacing any previous content) with two commands per *gam
# file; the commands are only written, not run. All "vg stats" lines come
# first, followed by all "vg view | jq | wc -l" lines.
{
  # Alignment statistics for each file, redirected to <file>.stats.
  for gam in *gam; do
    printf 'vg stats -a %s > %s.stats\n' "$gam" "$gam"
  done
  # Count of records selected by the jq filter (mapping_quality absent or
  # falsy), redirected to <file>.uniq.
  for gam in *gam; do
    printf "vg view -a %s | jq -c 'select(.mapping_quality | not)' | wc -l > %s.uniq\n" "$gam" "$gam"
  done
} > cmds