织梦CMS - 轻松建站从此开始!

罗索

shell脚本(sed及awk的应用)

jackyhwei 发布于 2011-07-26 13:32 点击:次 
shell脚本中sed及awk的使用。
TAG:

IR4QA-splitqrels脚本

#!/bin/sh

# IR4QA-splitqrels: split a qrels file into per-topic relevance files.
#
# Usage: IR4QA-splitqrels <IR4QAqrels file>
#
# Every qrels line is treated as "<topicID> <rest>", i.e. the topic ID is
# everything before the first space. Relative to the current directory the
# script creates:
#   <qrels>.tid              list of the topic IDs that occur in the qrels,
#                            one per line, in order of first appearance
#   <topicID>/<topicID>.rel  the sorted, de-duplicated "<rest>" part of every
#                            qrels line that belongs to that topic
# Progress messages are written to stderr.

if [ $# -ne 1 ]; then
    echo "usage: $(basename "$0") <IR4QAqrels file>" >&2
    echo " e.g.: $(basename "$0") ACLIA1-JA.qrels" >&2
    exit 1
fi

QRELS=$1

TIDSUF="tid"
#RELSUF="prel" # pseudorel
RELSUF="rel"

# create a topicid file from the qrel file

TIDFILE=$QRELS.$TIDSUF

# Keep only the text before the first space and emit each topic ID once.
# A plain `uniq` would only collapse *adjacent* duplicates, so a qrels file
# that is not grouped by topic would list the same topic several times.
# Blank lines are dropped because they cannot name a topic directory.
awk '{
    sub(/ .*$/, "")
    if ($0 != "" && !($0 in seen)) {
        seen[$0] = 1
        print
    }
}' "$QRELS" > "$TIDFILE" || exit 1
echo "created $TIDFILE" >&2

# create a rel assessment file in each topic directory

while read -r TID; do

    mkdir -p "$TID" || exit 1
    OUTF=$TID/$TID.$RELSUF

    # Select the lines of this topic, strip the leading "<topicID> " and
    # store the remainder sorted and without duplicates.
    grep "^$TID " "$QRELS" | sed 's/^[^ ]* //' | sort -u > "$OUTF"
    echo "created $OUTF" >&2

done < "$TIDFILE"

IR4QA-splitruns脚本

#!/bin/sh

# IR4QA-splitruns: break a single xml file in the ACLIA1 IR4QA submission
# format into per-topic ranked list files (*.res).
#
# Usage: IR4QA-splitruns <topicIDlist> [runpath]
#   <topicIDlist>  file holding one topic ID per line
#   [runpath]      path of the run file; when it is omitted, run paths are
#                  read from standard input, one per line (this is the form
#                  shown in the usage example)
#
# For each topic ID the file <topicID>/<topicID>.<run>.res is written, where
# <run> is the run path with everything up to the last "/" removed. The file
# holds one DOCID value per line, in the order found in the run file, and is
# truncated after DOCLIMIT entries. A file is created (empty) even when the
# run has nothing for a topic. The topic directories are not created here.
# Progress messages are written to stderr.

if [ $# -lt 1 ]; then
    echo "usage: $(basename "$0") <topicIDlist> [runpathlist]" >&2
    echo " e.g.: echo ./Runs/CMUJAV-EN-JA-01-T | $(basename "$0") ACLIA1-JA.qrels.tid" >&2
    exit 1
fi

#AWK=/usr/bin/gawk
AWK=awk

SUF="res"
# truncate if the run is too long
DOCLIMIT=1000

TIDLIST=$1
shift

# split_run <runpath>
# Write one *.res file per topic listed in $TIDLIST for the given run file.
split_run() {
    RUNPATH=$1
    # run name = path with the directory part removed
    RUN=${RUNPATH##*/}

    while read -r TID; do

        OUTF=$TID/$TID.$RUN.$SUF

        # The values are handed to awk with -v instead of being spliced into
        # the program text, and the redirection sits on the same line as the
        # end of the awk command so that the output really lands in $OUTF.
        $AWK -v tid="$TID" -v limit="$DOCLIMIT" '
            BEGIN { open_tag = "<TOPIC ID=\"" tid "\">"; count = 0 }

            # start collecting after the opening tag of the wanted topic
            index($0, open_tag) { in_topic = 1; next }

            in_topic {
                # the closing tag ends the topic: nothing more to read
                if ($0 ~ /<\/TOPIC>/) exit

                if ($0 ~ / DOCID=/) {
                    # keep only the text between DOCID=" and the next quote
                    # that is followed by a space
                    sub(/^.* DOCID="/, "")
                    sub(/" .*$/, "")
                    # drop the first _CMN_ marker, if any (done here rather
                    # than with a non-portable in-place sed afterwards)
                    sub(/_CMN_/, "")
                    print
                    if (++count >= limit) exit
                }
            }
        ' "$RUNPATH" > "$OUTF"

        echo "created $OUTF" >&2

    done < "$TIDLIST"
}

if [ $# -ge 1 ]; then
    split_run "$1"
else
    # no run path on the command line: take the run paths from stdin
    while read -r RUNPATH; do
        [ -n "$RUNPATH" ] || continue
        split_run "$RUNPATH"
    done
fi

IR4QA-qeval脚本

#!/bin/sh

# IR4QA-qeval: read the *.rel file and a *.res (ranked list) file for each
# topic and compute evaluation metrics by calling the C program q_eval.
#
# Usage: IR4QA-qeval <topicIDfile> <evaluationname> [run]
#   <topicIDfile>     file holding one topic ID per line
#   <evaluationname>  label that becomes part of the output file names
#   [run]             run name; when it is omitted, run names are read from
#                     standard input, one per line (this is the form shown
#                     in the usage example)
#
# Files written per run:
#   <topicID>/<topicID>.<run>.<evaluationname>.lab  output of "q_eval label"
#   <run>.<evaluationname>.qev                      collected output of
#                                                   "q_eval compute" for all
#                                                   topics
# The performance values averaged across topics go to standard output;
# progress messages go to stderr.

if [ $# -lt 2 ]; then
    echo "usage: $(basename "$0") <topicIDfile> <evaluationname> [runlist]" >&2
    echo " e.g.: echo TEST-EN-JA-01-T | $(basename "$0") ACLIA1-JA.qrels.tid default" >&2
    exit 1
fi


# write your q_eval label/comp options here

LABELOPT=""

COMPOPT="-cutoffs 10,1000 -g 1:2"
# The above is for the ACLIA1 IR4QA collection which only has
# relevant and partially relevant docs.

#COMPOPT="-cutoffs 10,1000 -g 1:2:3"
# The above is for NTCIR collections with highly relevant / relevant
# / partially relevant docs.


# select the metrics you want here

#METRICS="RR O-measure P-measure P-plus AP Q-measure RBP MSnDCG@0010 MSnDCG@1000 P@0010 Hit@0010"

METRICS="AP Q-measure MSnDCG@1000"

OSUF="qev"


TIDFILE=$1
OUTSTR=$2
shift
shift

RESSUF="res"
RELSUF="rel"
LABSUF="lab"

#AWK=/usr/bin/gawk
AWK=awk

# set your own q_eval path here
QEVPATH=./q_eval

# number of topics (line count of the topic ID file)
NTOPICS=$(wc -l < "$TIDFILE" | $AWK '{print $1}')

# The averages below divide by NTOPICS, so an empty or unreadable topic
# list would otherwise end in an awk division-by-zero error.
if [ "${NTOPICS:-0}" -le 0 ]; then
    echo "$(basename "$0"): no topics found in $TIDFILE" >&2
    exit 1
fi

# eval_run <run>
# Label and score every topic of one run, then print the per-metric averages.
eval_run() {
    RUN=$1
    OUTF=$RUN.$OUTSTR.$OSUF

    while read -r TID; do

        LABFILE=$TID/$TID.$RUN.$OUTSTR.$LABSUF

        # LABELOPT and COMPOPT are left unquoted on purpose: each holds a
        # list of separate q_eval options that must be split into words.
        # q_eval gets its input by redirection so that it cannot consume
        # the topic list this loop is reading.
        $QEVPATH label -r "$TID/$TID.$RELSUF" $LABELOPT \
            < "$TID/$TID.$RUN.$RESSUF" > "$LABFILE"
        echo "created $LABFILE" >&2

        $QEVPATH compute -r "$TID/$TID.$RELSUF" $COMPOPT -out "$TID" \
            < "$LABFILE"

    done < "$TIDFILE" > "$OUTF"

    echo "created $OUTF" >&2

    echo "averaging over $NTOPICS topics:"
    for MET in $METRICS; do

        # Sum the last field of every line that mentions " <metric>=".
        # Run name, metric name and topic count are passed with -v so that
        # they are treated as data, not as part of the printf format.
        grep -F " ${MET}=" "$OUTF" |
            $AWK -v run="$RUN" -v met="$MET" -v ntopics="$NTOPICS" '
                BEGIN { sum = 0 }
                { sum += $NF }
                # note that dividing by NR is incorrect.
                END { printf("%s %s %.4f\n", run, met, sum / ntopics) }
            '

    done
}

if [ $# -ge 1 ]; then
    eval_run "$1"
else
    # no run name on the command line: take the run names from stdin
    while read -r RUN; do
        [ -n "$RUN" ] || continue
        eval_run "$RUN"
    done
fi

run脚本(调用前三个脚本)

#!/bin/sh

# run: evaluate one IR4QA run file against ACLIA1-CS.qrels by calling the
# three helper scripts in turn.
#
# Usage: run <file.xml>
#
# Steps: split the qrels per topic, split the run file per topic, build the
# project with make, then compute the metrics. The script stops at the first
# step that fails, because every later step depends on the files produced by
# the earlier ones.

if [ $# -ne 1 ]; then
    echo "usage: $0 <file.xml>" >&2
    # echo "usage: `basename $0` <file.xml>" >&2
    echo " e.g.: $0 OT-CS-CS-02-T.xml" >&2
    exit 1
fi

MYFILE=$1

# remove a result directory left over from an earlier invocation
if [ -d result ]; then
    rm -r result
fi

# create topic directories under "current directory"
# and creates a qrels file(*.rel) for each topic.
# also create a list of topicIDs(*.tid) that are included in the original qrels

#if [ -f ACLIA1-CS.qrels.tid ]; then
./IR4QA-splitqrels ACLIA1-CS.qrels || exit 1
#fi

# break a single xml file in the ACLIA1 IR4QA submission format
# into per-topic ranked list files(*.res).

./IR4QA-splitruns ACLIA1-CS.qrels.tid "$MYFILE" || exit 1

# build step; presumably this produces the q_eval program used below
make || exit 1

# read the *.rel file and a *.res (ranked list) file for each topic
# and compute evaluation metrics(*.lab), by calling the C program q_eval.
# *.qev file contains per-topic performance values.
# also output performance values averaged across topics(to standard output).

./IR4QA-qeval ACLIA1-CS.qrels.tid default "$MYFILE"

# mkdir -p result
# mv ACLIA1-CS-T* result/
# mv ACLIA1-CS.qrels.tid result/
# mv OT-CS-CS-05-T.xml.default.qev result/

(studyarea)
本站文章除注明转载外,均为本站原创或编译。欢迎任何形式的转载,但请务必注明出处,尊重他人劳动,同学习共成长。转载请注明:文章转载自:罗索实验室 [http://www.rosoo.net/a/201107/14749.html]
本文出处:hi.baidu.com/studyarea 作者:studyarea
顶一下
(0)
0%
踩一下
(0)
0%
------分隔线----------------------------
发表评论
请自觉遵守互联网相关的政策法规,严禁发布色情、暴力、反动的言论。
评价:
表情:
用户名: 验证码:点击我更换图片
栏目列表
将本文分享到微信
织梦二维码生成器
推荐内容