bash、perl处理文件效率对比

场景：根据一份对应关系（保存在一个文件中），逐行读取数据文件并替换其中的指定字段，数据文件约10万行

例如对应关系存在pairs.txt中：

ABABAB|CDCDCDCD

ACACAC|CACACACA

BBBBBB|CCCCCCCC

AAAAAA|BBBBBBBB

CCCCCC|DDDDDDDD

需要替换的文件contents.txt，需要替换第4个字段:

1|2|3|ABABAB|4

1|2|3|ACACAC|4

1|2|3|BBBBBB|4

1|2|3|ABABAB|4

1|2|3|AAAAAA|4

1|2|3|ABABAB|5

......

......{省略剩下的10W行+}

由于比较熟悉bash，第一反应是写个bash脚本处理：

#!/bin/bash
#
# Replace one '|'-separated field of every line of a data file, using a
# KEY|VALUE mapping file, and write the result to newfile.txt.
#
# Usage: cuttingx.sh <keyfile> <transfile> <field-index>
#   keyfile      lines of the form KEY|VALUE
#   transfile    '|'-separated records to rewrite
#   field-index  zero-based index of the field to replace

# Validate the arguments before using any of them.
if [ $# -ne 3 ]
then
    echo "need 3 args"
    exit 1
fi

keyfile=$1
transfile=$2
count=$3

# The index is used as an array subscript, so it has to be a plain number.
case $count in
    ''|*[!0-9]*)
        echo "field index must be a non-negative integer"
        exit 1
        ;;
esac

# Replacement key/value pairs.
declare -A keywords

# Join all arguments with '|'. IFS is changed only inside the function, and
# no subshell is forked, which matters when this runs once per input line.
join_fields() {
    local IFS='|'
    printf '%s\n' "$*"
}

# Splitting on IFS='|' (instead of replacing '|' with spaces and relying on
# word splitting) keeps fields that contain spaces or glob characters intact.
while IFS='|' read -r key val _
do
    # Skip blank lines: an empty subscript is an error for associative arrays.
    if [ -n "$key" ]
    then
        keywords[$key]=$val
    fi
done < "$keyfile"

# The output is redirected once for the whole loop; '>' truncates any previous
# newfile.txt, so no separate rm is needed and the file is not reopened for
# every line.
while IFS='|' read -r -a numbers
do
    # Blank lines produce no output.
    [ ${#numbers[@]} -eq 0 ] && continue

    number=${numbers[$count]}
    if [ -n "$number" ]
    then
        tmpkeyval=${keywords[$number]}
        if [ -n "$tmpkeyval" ]
        then
            numbers[$count]=$tmpkeyval
        fi
    fi

    join_fields "${numbers[@]}"
done < "$transfile" > newfile.txt

统计一下执行时间，执行时间较长，38秒多：

[xxxx]$ time ./cuttingx.sh pairs.txt contents.txt 3

real 0m38.619s

user 0m34.861s

sys 0m3.705s

由于实际业务中要处理的文件是200W行以上，按照上述实验的估算，执行耗时不可接受。

同事说perl实现很快，学了一下基本语法，开干

#!/usr/bin/perl
#
# Replace one '|'-separated field of every line of a source file, using a
# KEY|VALUE mapping file.
#
# Usage: trans.pl <mapping-file> <source-file> <field-index> <output-file>
#   mapping-file  lines of the form KEY|VALUE
#   source-file   '|'-separated records to rewrite
#   field-index   zero-based index of the field to replace
#   output-file   destination; overwritten on every run

use strict;
use warnings;

if (@ARGV != 4) {
    print STDERR "need 4 args: <mapping-file> <source-file> <field-index> <output-file>\n";
    exit 1;
}

my ($transfile, $srcfile, $index, $outfile) = @ARGV;

# The index is used as an array subscript, so reject anything non-numeric.
die "field index must be an integer: $index\n" unless $index =~ /^-?\d+$/;

# Load the KEY|VALUE pairs. A lexical handle is used because the bareword
# DATA is reserved by Perl for the __DATA__ section of the script itself.
my %hashs;
open(my $map_fh, '<', $transfile) or die "can not open file $transfile: $!";
while (my $line = <$map_fh>) {
    chomp $line;
    my ($key, $val) = split /\|/, $line;
    # Ignore blank or incomplete lines instead of storing an undefined value.
    next unless defined $key && defined $val;
    $hashs{$key} = $val;
}
close $map_fh;

open(my $src_fh, '<', $srcfile) or die "can not open file $srcfile: $!";
# '>' rather than '>>': running the script twice must not append a second
# copy of the converted data to the same output file.
open(my $out_fh, '>', $outfile) or die "can not open file $outfile: $!";

while (my $line = <$src_fh>) {
    chomp $line;
    # A limit of -1 keeps trailing empty fields, so "a|b||" is written back
    # with the same number of separators it came in with.
    my @values = split /\|/, $line, -1;
    my $tmp = $values[$index];
    if (defined $tmp && exists $hashs{$tmp}) {
        $values[$index] = $hashs{$tmp};
    }
    print {$out_fh} join('|', @values), "\n";
}

close $src_fh;
# Buffered write errors surface at close time, so check it.
close $out_fh or die "can not close file $outfile: $!";

统计一下执行时间：约0.57秒，而bash版本需要38.6秒，速度提升了约67倍

[xxxx]$ time ./trans.pl pairs.txt contents.txt 3 output.txt

real 0m0.574s

user 0m0.564s

sys 0m0.009s

最后用c++实现一遍，做个对比：

#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

void split(char *src,const char *separator,char **dest,int *num);

using namespace std;

// Zero-based index of the field that is looked up and replaced.
static const int kTargetField = 3;

/**
 * Tokenizes one line read by fgets using split().
 *
 * The trailing line terminator is removed first so that it never becomes part
 * of the last field. The destination array is sized from the number of '|'
 * characters in the line, so split() cannot write past its end however many
 * fields the line has.
 *
 * @param line    Writable line buffer; modified in place.
 * @param fields  Receives pointers into line, one per field.
 * @return        Number of fields found (0 for an empty line).
 */
static int split_line(char *line, std::vector<char*> &fields) {
    line[strcspn(line, "\r\n")] = '\0';

    const size_t capacity =
        static_cast<size_t>(std::count(line, line + strlen(line), '|')) + 1;
    char *const unset = NULL;
    fields.assign(capacity, unset);

    // Start from 0 so the result is correct even when split() returns early.
    int num = 0;
    split(line, "|", &fields[0], &num);
    return num;
}

/**
 * Replaces the 4th '|'-separated field of every line of a contents file using
 * a KEY|VALUE mapping file.
 *
 * Usage: cut <pairs-file> <contents-file> <output-file>
 *
 * Lines whose 4th field has no mapping (or that have fewer than four fields)
 * are copied to the output unchanged. Lines are read through a 1024-byte
 * buffer, so a longer line is processed in pieces. Because split() is built on
 * strtok, empty fields ("a||b") are collapsed when a line is rewritten.
 *
 * @return 0 on success, 1 on a usage or I/O error.
 */
int main(int argc, char* args[]) {
    if (argc != 4) {
        fprintf(stderr, "usage: cut <pairs-file> <contents-file> <output-file>\n");
        return 1;
    }

    FILE *keysfile = fopen(args[1], "r");
    FILE *contents = fopen(args[2], "r");
    // "w" creates or truncates the output; "rw+" is not a valid fopen mode.
    FILE *outputfile = fopen(args[3], "w");
    if (keysfile == NULL || contents == NULL || outputfile == NULL) {
        perror("fopen");
        if (keysfile != NULL) fclose(keysfile);
        if (contents != NULL) fclose(contents);
        if (outputfile != NULL) fclose(outputfile);
        return 1;
    }

    char readbuf[1024];
    std::map<std::string, std::string> keys;
    std::vector<char*> fields;

    // Load the KEY|VALUE pairs; lines with fewer than two fields are ignored.
    while (fgets(readbuf, sizeof(readbuf), keysfile)) {
        if (split_line(readbuf, fields) >= 2) {
            keys.insert(std::pair<std::string, std::string>(fields[0], fields[1]));
        }
    }

    while (fgets(readbuf, sizeof(readbuf), contents)) {
        // strtok overwrites the separators in readbuf, so keep an untouched
        // copy for the lines that are passed through unchanged.
        const std::string original(readbuf);
        const bool had_newline =
            !original.empty() && original[original.size() - 1] == '\n';

        const int num = split_line(readbuf, fields);

        std::map<std::string, std::string>::const_iterator hit = keys.end();
        if (num > kTargetField) {
            hit = keys.find(fields[kTargetField]);
        }

        if (hit == keys.end()) {
            fputs(original.c_str(), outputfile);
            continue;
        }

        // Rebuild the line with the mapped value in place of the target field.
        std::string outline;
        for (int i = 0; i < num; ++i) {
            if (i > 0) {
                outline += '|';
            }
            if (i == kTargetField) {
                outline += hit->second;
            } else {
                outline += fields[i];
            }
        }
        if (had_newline) {
            outline += '\n';
        }
        fputs(outline.c_str(), outputfile);
    }

    fclose(keysfile);
    fclose(contents);
    // Buffered write errors are reported when the stream is closed.
    if (fclose(outputfile) != 0) {
        perror("fclose");
        return 1;
    }

    return 0;
}

/**
 * Splits src in place into tokens separated by any character of separator.
 *
 * Built on strtok, so: src is modified (each separator that ends a token is
 * overwritten with '\0'), runs of consecutive separators yield no empty
 * token, and the function is not reentrant.
 *
 * @param src        NUL-terminated, writable string to tokenize.
 * @param separator  Set of delimiter characters.
 * @param dest       Receives one pointer into src per token. The caller must
 *                   provide room for every token; no bound is checked here.
 * @param num        Receives the number of tokens stored in dest; set to 0
 *                   when there is nothing to split.
 */
void split(char *src,const char *separator,char **dest,int *num) {
    // Report "no tokens" up front so that every early return leaves *num
    // defined instead of holding the count from a previous call.
    if (num != NULL)
        *num = 0;

    if (src == NULL || *src == '\0')
        return;
    if (separator == NULL || *separator == '\0')
        return;
    if (dest == NULL)
        return;

    int count = 0;
    for (char *token = strtok(src, separator);
         token != NULL;
         token = strtok(NULL, separator)) {
        *dest++ = token;
        ++count;
    }

    if (num != NULL)
        *num = count;
}

[xxx]# time ./cut pairs.txt contents.txt out.txt

real 0m0.325s

user 0m0.322s

sys 0m0.002s

原理：

Perl是一种类似BASIC的脚本语言。更专业一点说，Perl是一种字节编译语言，同时自带字节码解释器。它不会像unix中的shell那样逐行读取并执行程序。相反，Perl会先通读一遍整个文件，将其编译为内部表示，然后再执行指令。

虽然Perl是一种脚本语言，但是在所有的脚本语言中，它的执行速度可能是最快的。因为Perl本身是采用C语言开发，很多模块也是使用C语言开发的。换句话说，Perl执行某项指令可能是直接调用C语言开发的函数。

在编译的同时，也进行了一些代码的优化，例如，消除了不可能执行的代码，计算了常量表达式，加载了库定义。

bash、perl处理文件效率对比

bash、perl处理文件效率对比

相关阅读更多精彩内容

友情链接更多精彩内容