-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathseg-FrameNet-Semafor
More file actions
214 lines (172 loc) · 8.36 KB
/
seg-FrameNet-Semafor
File metadata and controls
214 lines (172 loc) · 8.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/bin/bash
#
# /usr/local/bin/seg-FrameNet-Semafor
#
# Calls FrameNet-06.py -- see also cc-integrate-rongda-segmentation
#
# Written by FFS 2014-08-04
#
# Changelog
#
# 2014-09-10 Rewrite for Semafor
# 2014-08-04 Forked from seg-PartsOfSpeech
#
#---------------------------------------------------------------------
# Parsing script and primary tag
#
# ParseScript  program that converts Semafor's json output into tagged lines
# PTAG         primary tag that marks the annotation lines written by ParseScript
ParseScript=FrameNet-06.py
PTAG=FRM_01
SRC=seg  # Source file type
TMP=json # Temporary file type
TAR=frm  # Target file type
# Name this script was invoked as, without its directory; quoted so that an
# invocation path containing whitespace is not word-split
SCRIPT="$(basename -- "$0")"
# Help screen
#
# show_help prints the usage text, the examples and the output codebook.
# It reads SCRIPT, SRC and TAR, which are set earlier in the script.
show_help() {
  echo -e "\n\t$SCRIPT [<date> or #] [<partially matching filename>] [clobber or manual]\n"
  echo -e "\tAnnotate semantic frames files using Semafor 3.0-alpha4 with FrameNet 1.5.\n"
  echo -e "\tUse 'manual' to see verbose output -- requires ctrl-c to terminate hangs.\n"
  echo -e "\tExamples:\n"
  echo -e "\tProcess the .$SRC files from seven days ago:\n"
  echo -e "\t\t$SCRIPT 7 \n"
  echo -e "\tProcess only Aljazeera files from a given date, removing any pre-existing FMT codes:\n"
  echo -e "\t\t$SCRIPT 2006-12-28 Aljazeera clobber\n"
  echo -e "\tUse a for loop to process a series of days (see daysago 2004-06-01) -- _ matches all files:\n"
  echo -e "\t\tsweep 2013-01-01 ; for i in {1..366} ; do $SCRIPT here _ clobber ; sweep + 1 ; done\n"
  echo -e "\tThe current version of the script is experimental and creates new .$TAR files."
  echo -e "\tThe files are processed in ca:/sweep, where they may be sync'd.\n"
  echo -e "\tCodebook with example lemma:\n"
  echo -e "\tStart time|End time|Primary tag|Token|Position|Frame name|Semantic Role Labeling|Token|Position|Frame element"
  echo -e "\t20140901233124.883|20140901233139.031|FRM_01|DECISION|24-25|Deciding|SRL|HER|23-24|Cognizer\n"
}

# Any of -h, --help or help as the first parameter shows the help and stops
case "$1" in
  -h|--help|help)
    show_help
    exit
    ;;
esac
# Start time (epoch seconds), used for the elapsed-time receipt at the end
START=$(date +%s)

# Hostname
HOST=$(hostname -s)

# The FrameNet repository is currently only on cartago
if [[ $HOST != cartago ]]; then
  echo -e "\n\tThe FrameNet repository is currently only on cartago.\n"
  exit
fi

# Get the date to work on (today's date may change while the script is running)
case "$1" in
  here)
    # The name of the current directory is taken as the date
    DAY=$(pwd)
    DAY=${DAY##*/}
    ;;
  "")
    DAY=$(date +%F)
    ;;
  *)
    # A parameter made up of digits only is a number of days ago;
    # anything else is passed on as the date itself
    if echo $1 | grep -q '[^0-9]'; then
      DAY="$1"
    else
      DAY=$(date -d "-$1 day" +%F)
    fi
    ;;
esac

# Sanity check -- date must be able to parse the value
if [[ -z $(date -d "$DAY" 2>/dev/null) ]]; then
  echo -e "\n\t$SCRIPT [<date> or #] [<partially matching filename>] [clobber]\n"
  exit
fi

# Partial file name? Default to "_"; the result is a glob that starts with the date
NAM="${DAY}*${2:-_}"

# Generate the date-dependent portion of the path (YYYY/YYYY-MM/YYYY-MM-DD)
DDIR=$(date -d "$DAY" +%Y/%Y-%m/%F)

# Base directory
case "$HOST" in
  roma) SDIR=/tv/$DDIR ;;
  *)    SDIR=/sweep/$DDIR ;;
esac

# Define the trouble log and e-mail recipients
FAILED="/tmp/$SCRIPT.log"
TO=$(cat /usr/local/bin/e-mail)

# File counter
NUM=0

# Welcome
echo -e "\n\tSemantic frame annotation with Semafor 3 in seg files for $DAY on $( date )\n"
# Process each text file in turn
#
# Reads SDIR, NAM, SRC, TMP, TAR, PTAG, ParseScript, SCRIPT, FAILED and NUM,
# which are set earlier in the script, plus the third positional parameter
# ("clobber" re-annotates files whose annotation is not from $ParseScript,
# "manual" runs Semafor in the foreground). FIL is only assigned when a source
# file was found, and NUM is incremented once per completed annotation.

# Semafor 3 installation; runSemafor.sh is started from inside this directory.
# The location can be overridden through the environment.
SEMAFOR_DIR="${SEMAFOR_DIR:-/mnt/tvspare/software/java/semafor-experimental/semafor}"

for SRCFILE in "$SDIR"/$NAM*."$SRC" ; do

  # An unmatched glob is left as a literal pattern -- skip it so FIL stays empty
  [ -e "$SRCFILE" ] || continue

  # Strip path and extension
  FIL=${SRCFILE##*/} FIL=${FIL%.*}

  # Skip KMEX
  if [[ $FIL == *_KMEX_* ]] ; then continue ; fi

  # Mark incomplete files -- a complete target file has END in its last line
  if [[ -f "$SDIR/$FIL.$TAR" ]] && ! tail -n1 "$SDIR/$FIL.$TAR" 2>/dev/null | grep -q END ; then
    mv "$SDIR/$FIL.$TAR" "$SDIR/$FIL.$TAR-incomplete"
    echo -e "\t\t\tIncomplete annotation -- see $FIL.$TAR-incomplete"
  fi

  # Check for existing $PTAG tags
  if grep -q -E "^$PTAG" "$SDIR/$FIL.$TAR" 2>/dev/null ; then
    if [ "$3" = "clobber" ] ; then
      # Tweak as needed to identify the version considered up to date
      if grep -m1 -E "$ParseScript" "$SDIR/$FIL.$TAR" 2>/dev/null | grep -q "$PTAG" ; then
        echo -e "\t\t\t$ParseScript annotation $PTAG present in $FIL.$TAR -- skipping" ; continue
      fi
      echo -en "\t\t\tRe-annotating $PTAG with $ParseScript $FIL.$SRC"
      if [ "$SRC" = "$TAR" ] ; then
        # Annotations live in the source file itself: drop only the tagged lines
        sed -i "/$PTAG/d" "$SDIR/$FIL.$SRC"
      else
        rm -r "$SDIR/$FIL.$TAR"
      fi
    else
      echo -e "\t\t\t$ParseScript $PTAG completed in $FIL.$TAR" ; continue
    fi
  else
    echo -en "\t\t\tAnnotating $PTAG with $ParseScript $FIL.$SRC"
  fi

  # Move on if the stripped file exists -- a crude reservation system
  if [ -f "$SDIR/$FIL.stripped" ] ; then
    tput cr ; echo -e "\t\t\t$FIL.stripped exists -- skipping " ; continue
  fi

  # Keep the CC1 lines; remove header, tags, and timestamps, then chevrons and
  # blank lines; convert to UTF-8. Done as one stream so that no in-place
  # rewrite of the stripped file is needed.
  grep CC1 "$SDIR/$FIL.$SRC" \
    | sed -r -e 's/[A-Z0-9\.\|]{3,18}\|//g' -e 's/^[>]{1,5}\ ?//g' -e '/^\s*$/d' \
    | iconv -f ISO-8859-1 -t UTF-8 > "$SDIR/$FIL.stripped"

  # Remove if empty
  if [ ! -s "$SDIR/$FIL.stripped" ] ; then
    rm -f "$SDIR/$FIL.stripped" ; echo -e " -- no text" ; continue
  fi

  # Remove any previous temporary file
  rm -f "$SDIR/$FIL.$TMP"

  # Walk into the Semafor 3 directory to get the right environment (fixme)
  if ! cd "$SEMAFOR_DIR" ; then
    echo -e "\t$(date '+%F %H:%M') \t${SCRIPT%.*} \tNo semafor \t0 secs \t$FIL.$SRC" >> "$FAILED.$( date +%F )"
    tput cr ; echo -e "\t\t\tFAILED -- cannot enter Semafor directory $SEMAFOR_DIR "
    rm -f "$SDIR/$FIL.stripped" ; continue
  fi

  if [ "$3" != "manual" ] ; then

    # Annotate the text with Semafor -- source target threads -- and background it to catch hangs
    ./bin/runSemafor.sh "$SDIR/$FIL.stripped" "$SDIR/$FIL.$TMP" 5 &>/dev/null &

    # Get the PID and the start time of the annotation
    PID=$! AGE="$( date +%s )" ; n=0 ; s=1

    # Wait for the temporary file to start growing (first column margin counter);
    # stop waiting as soon as Semafor itself is gone
    while [ ! -s "$SDIR/$FIL.$TMP" ] && [ "$n" -lt 999 ] && ps -p "$PID" > /dev/null ; do
      tput cr ; echo -n "$( echo "$n * $s" | bc | cut -d"." -f1 )" ; sleep "$s" ; n=$((n+1))
    done

    # If the file did not get created within the time allotted, move on
    if [ ! -s "$SDIR/$FIL.$TMP" ] ; then
      kill "$PID" 2>/dev/null   # do not leave a hung Semafor running behind us
      LASTED=$(( $(date +%s) - AGE ))
      echo -e "\t$(date '+%F %H:%M') \t${SCRIPT%.*} \tNo json \t$LASTED secs \t$FIL.$SRC" >> "$FAILED.$( date +%F )"
      echo -e "\t\t\tFAILED -- no json file created for $FIL.$SRC "
      rm -f "$SDIR/$FIL.stripped" ; continue
    fi ; S1=0 S2=0 n=0 ; tput sc

    # Terminate if the file stops growing (second column margin counter).
    # The size is sampled one second apart; once more than 299 samples have
    # been taken, a sample pair with no growth ends the annotation.
    while ps -p "$PID" > /dev/null ; do
      S1="$( stat -c%s "$SDIR/$FIL.$TMP" 2>/dev/null )" ; sleep 1
      S2="$( stat -c%s "$SDIR/$FIL.$TMP" 2>/dev/null )" ; n=$((n+1))
      LASTED=$(( $(date +%s) - AGE ))
      tput cr ; tput ht ; echo -n "$n"
      if [ "${S1:-0}" -eq "${S2:-0}" ] && [ "$n" -gt 299 ] ; then
        kill "$PID" 2>/dev/null
        echo -e "\t$(date '+%F %H:%M') \t${SCRIPT%.*} \tNo grow \t$LASTED secs \t$FIL.$TMP" >> "$FAILED.$( date +%F )"
        echo -e "\t\t\tFAILED -- json file not growing -- see $FIL.$TMP-incomplete "
        mv "$SDIR/$FIL.$TMP" "$SDIR/$FIL.$TMP-incomplete"
        # The json file has been moved away; stop sampling it
        break
      fi
    done

    # Elapsed annotation time as MM:SS
    LASTED=$(( $(date +%s) - AGE ))
    LASTED=$(date -ud "+$LASTED seconds $(date +%F)" +%M:%S)

    # Receipt
    tput rc ; tput ht ; tput ht ; echo "$LASTED"

  else

    # Verbose Semafor -- source target threads -- use ctrl-c to get past hangs
    ./bin/runSemafor.sh "$SDIR/$FIL.stripped" "$SDIR/$FIL.$TMP" 5

  fi

  # Convert the json file
  if [ -f "$SDIR/$FIL.$TMP" ] ; then
    "$ParseScript" "$SDIR/$FIL.$SRC" "$SDIR/$FIL.$TMP" > "$SDIR/$FIL.$TAR"

    # Try again if it fails -- a target smaller than its source counts as a failure
    if [ "$( stat -c%s "$SDIR/$FIL.$TAR" )" -lt "$( stat -c%s "$SDIR/$FIL.$SRC" )" ] ; then
      "$ParseScript" "$SDIR/$FIL.$SRC" "$SDIR/$FIL.$TMP" > "$SDIR/$FIL.$TAR"
    fi

    # Verify the file is complete
    if tail -n1 "$SDIR/$FIL.$TAR" 2>/dev/null | grep -q END ; then
      NUM=$((NUM+1))
    else
      mv "$SDIR/$FIL.$TAR" "$SDIR/$FIL.$TAR-incomplete"
      echo -e "\t\t\tIncomplete annotation -- please check $FIL.$TAR-incomplete"
    fi
  else
    echo -e "\t\t\tFailed annotation -- please check $FIL.$SRC"
  fi

  # Clean up
  # NOTE(review): clean-up is disabled, so the .stripped reservation file
  # outlives the run and a later run (including clobber) skips this file
  # until it is removed by hand -- confirm this is intended.
  #rm -f $SDIR/$FIL.stripped $SDIR/$FIL.$TMP

done
# Sanity check -- FIL is empty only if the file loop never assigned it
if [[ -z $FIL ]]; then
  echo -e "\tUse a partially matching file name -- leave out the date and the extension.\n"
  exit
fi

# Completion time -- seconds since START, rendered as HH:MM:SS
NOW=$(date +%s)
LASTED=$(( NOW - $START ))
LASTED=$(date -ud "+$LASTED seconds $(date +%F)" +%H:%M:%S)

# Receipt
echo -e "\n\tCompleted annotating $NUM $SRC files to $TAR with $ParseScript in $SDIR ($LASTED)\n"

# EOF