-
Notifications
You must be signed in to change notification settings - Fork 0
/
PrepareTEData.pl
82 lines (67 loc) · 2.6 KB
/
PrepareTEData.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/perl
#===========File: PrepareTEData.pl===============
#Title: PrepareTEData.pl - Preprocess a Term Annotated Plaintext Document for CollTerm Testing.
#Description: POS tags a term annotated plaintext document and produces output data in a tokenized tab-separated format including term tags.
#Author: Kārlis Gediņš, SIA Tilde.
#Created: May, 2011.
#Last Changes: 29.07.2011. by Mārcis Pinnis, SIA Tilde.
#===============================================
use strict;
use warnings;
BEGIN
{
    #Registers the directory of this script as a module search location, so
    #that modules placed next to the script can be loaded.
    use FindBin qw($Bin); #$Bin is the directory this script was started from.
    push(@INC, $Bin);     #Appended, so the already configured locations are searched first.
}
#Checks that the four required parameters (language, POS tagger, input file,
#output file) are all set. If any of them is missing, the usage text is
#printed to STDERR and the script aborts.
if (grep { not defined $ARGV[$_] } 0 .. 3)
{
    print STDERR "usage perl PrepareTEData.pl [ARGS]\nARGS:\n\t1. [Language] - The tagger language (en|lv|et..).\n\t2. [POS Tagger] - The POS tagger to use (POS|Tree|Tagger).\n\t3. [Input File] - the path to the input file.\n\t4. [Output File] - The path to the output file\n\t5. [Delete temp files] - \"-D\" to delete temporary files (optional).\n";
    #An explicit message replaces the bare die, which would only print
    #"Died at ... line N." and tell the user nothing about the cause.
    die "[PrepareTEData] Missing required arguments.\n";
}
#Path of the input file (third argument), exactly as given on the command line.
my $FullIputfilename = $ARGV[2];
#Defines the location where to write temporary files: a "data" sub-directory
#next to the output file (fourth argument), or "data/" relative to the current
#directory when the output path contains no directory separator.
my $outputDir = $ARGV[3];
if ($outputDir =~ m{[\\/]})
{
    $outputDir =~ tr{\\}{/};    #Backslash separators are turned into forward slashes.
    $outputDir =~ s{/[^/]+$}{}; #Cuts off the last path component (the output file name).
    $outputDir .= "/data/";
}
else
{
    $outputDir = "data/";
}
#Creates temp data directory if it does not exist.
unless (-d $outputDir)
{
    #$! is included so that the reason for the failure (permissions, missing
    #parent directory, ...) is visible to the user.
    mkdir($outputDir)
        or die "[PrepareTEData] Cannot find nor create output directory \"$outputDir\": $!";
}
use TEPreprocess;
#Derives the base name of the input file (no directory part, no last
#extension). It is used below to name the temporary files.
my $Iputfilename = $FullIputfilename;
#Backslash separators are turned into forward slashes.
$Iputfilename =~ tr{\\}{/};
#The directory part is removed BEFORE the extension. Doing it the other way
#round turns "dir.v1/input" into "dir", because everything after the last dot
#(including the "/") would be taken for the extension.
$Iputfilename =~ s{.*/([^/]+)}{$1};
#Removes the last extension. At least one character must precede the dot, so a
#name such as ".txt" is kept instead of becoming an empty string.
$Iputfilename =~ s{(.+)\.[^.]+$}{$1};
#Specifies, whether to delete temporary data files: 1 only when the optional
#fifth argument is exactly "-D", otherwise 0.
my $del = ($ARGV[4] && $ARGV[4] eq "-D") ? 1 : 0;
#Splits the input document into two separate temporary documents: one with the
#plaintext (".plain") and one with the tags (".tags").
TEPreprocess::Detagger(
    "$FullIputfilename", #Passed as a copy, as before.
    $outputDir . $Iputfilename . ".plain",
    $outputDir . $Iputfilename . ".tags",
);
use Tag;
#POS-tags the plaintext document.
#Arguments as passed here: language, POS tagger name, path of the plaintext
#file, path of the POS-tagged file to produce, the delete-temporary-files flag
#and a literal 1.
#NOTE(review): the meaning of the trailing 1 is defined by Tag::TagText - confirm there.
Tag::TagText($ARGV[0], $ARGV[1], "$outputDir$Iputfilename.plain", "$outputDir$Iputfilename.POS", $del, 1);
#Combines the term tags with the POS-tagged document; the output file path
#(fourth command line argument) is handed over as the third parameter.
TEPreprocess::AddNewTags("$outputDir$Iputfilename.POS", "$outputDir$Iputfilename.tags", "$ARGV[3]", "1");
#Deletes the temporary files and the temporary directory if required.
if ($del)
{
    #NOTE(review): no statement in this script writes a ".taggs" file; it is
    #still removed here - confirm whether one of the called modules creates it.
    for my $extension (qw(plain taggs POS tags))
    {
        unlink("$outputDir$Iputfilename.$extension");
    }
    #The result is not checked, so a directory that could not be removed
    #(for example because it is not empty) is silently left in place.
    rmdir($outputDir);
}
exit;