use strict;
use warnings;
use utf8;

use Encode qw(encode_utf8);
use File::Slurp;

# Text normaliser.
#
# Usage: <script> INPUT OUTPUT
#
# Reads INPUT, rewrites it and stores the result in OUTPUT:
#
#   1. Every character of the input is classified with the lists in lists/
#      to decide whether the text counts as Tamil, English or neither.
#   2. The text is rewritten character by character: '$' and '+' are spelled
#      out, bracket-like punctuation becomes a blank, a comma between two
#      digits is dropped, lone digits are spaced out and characters found in
#      lists/spl_chr_map are replaced by their mapped words.
#   3. The result is re-tokenised on whitespace to tidy numbers, dates and
#      ranges, every hyphen is surrounded by blanks, and the tokens are
#      finally written out separated by single blanks.
#   4. scripts/replace_dot_by_sil.pl is run on OUTPUT.
#
# The input is split with an empty pattern without any decoding step, so all
# intermediate strings are kept as undecoded byte strings. The Tamil literals
# below are therefore encoded to UTF-8 bytes once, up front.

# Characters counted as "special" by the classifier without a list lookup.
my %IS_REGEX_META = map { $_ => 1 }
    ('(', ')', '\\', '.', '^', '*', '+', '?', '{', '}', '[', ']', '|');

# Characters the rewriting pass replaces by a single blank.
my %IS_BLANKED = map { $_ => 1 }
    ('(', ')', '\\', '^', '*', '?', '{', '}', '[', ']', '|');

my $TAMIL_DOLLAR = encode_utf8(' டாலர் ');
my $TAMIL_PLUS   = encode_utf8(' ப்ளஸ் ');

my $POST_PROCESSOR = 'scripts/replace_dot_by_sil.pl';

main();

# Entry point: validates the arguments, runs all passes in memory and writes
# the output file exactly once before handing it to the post-processor.
sub main {
    if (@ARGV != 2) {
        print "\tArgv1 --> input text file\n";
        print "\tArgv2 --> output text file\n";
        exit(0);
    }
    my ($input_path, $output_path) = @ARGV;

    my %is_english   = map { $_ => 1 } read_list('lists/english');
    my %is_special   = map { $_ => 1 } read_list('lists/spl_chr');
    my %is_number    = map { $_ => 1 } read_list('lists/number');
    my %map_line_for = index_symbol_map(read_list('lists/spl_chr_map'));

    my $input = read_file($input_path);
    my @chars = split //, $input;

    my ($english_count, $tamil_count)
        = count_languages(\@chars, \%is_english, \%is_special, \%is_number);

    # Tamil wins as soon as a single character was classified as Tamil.
    my $mode = $tamil_count > 0   ? 'tamil'
             : $english_count > 0 ? 'english'
             :                      'none';

    my $text = expand_characters(\@chars, $mode, \%is_english, \%map_line_for);
    $text = normalize_tokens($text, ' - ');
    $text =~ s/-/ - /g;
    $text = normalize_tokens($text, ' ');
    $text = join '', map { "$_ " } split_tokens($text);

    open(my $out, '>', $output_path)
        or die "Cannot open $output_path for writing: $!\n";
    print {$out} $text or die "Cannot write to $output_path: $!\n";
    close($out) or die "Cannot close $output_path: $!\n";

    run_post_processor($output_path);
    return;
}

# Returns the lines of the file at $path (split on "\n").
sub read_list {
    my ($path) = @_;
    my $content = read_file($path);
    return split /\n/, $content;
}

# Builds a lookup from a character to the first map line that starts with
# that character followed by a blank. Other lines are ignored.
sub index_symbol_map {
    my @lines = @_;
    my %line_for;
    for my $line (@lines) {
        next unless $line =~ /\A(.) /s;
        $line_for{$1} = $line unless exists $line_for{$1};
    }
    return %line_for;
}

# True when $char is exactly one ASCII digit.
sub is_digit {
    my ($char) = @_;
    return $char =~ /\A[0-9]\z/ ? 1 : 0;
}

# Classifies one character as 'special', 'space', 'english', 'number' or
# 'tamil'. Anything that matches no rule and no list counts as 'tamil'.
sub classify_char {
    my ($char, $is_english, $is_special, $is_number) = @_;
    return 'special' if $IS_REGEX_META{$char} || $char eq '$';
    return 'space'   if $char eq ' ' || $char eq "\t" || $char eq "\n";
    return 'english' if $is_english->{$char};
    return 'special' if $is_special->{$char} || $char eq '-' || $char eq '/';
    return 'number'  if $is_number->{$char};
    return 'tamil';
}

# Returns (english count, tamil count) for the characters in @$chars.
sub count_languages {
    my ($chars, $is_english, $is_special, $is_number) = @_;
    my %count = (english => 0, tamil => 0);
    for my $char (@{$chars}) {
        $count{ classify_char($char, $is_english, $is_special, $is_number) }++;
    }
    return ($count{english}, $count{tamil});
}

# Decides how a digit that is not next to '/', '-' or '.' is emitted:
# unchanged when a digit follows, with a leading blank when it starts a number
# that is followed by ',' or '.', and with a trailing blank when it ends a
# number (unless it directly follows ',' or '.').
sub format_digit {
    my ($digit, $prev, $nxt) = @_;
    return $digit if is_digit($nxt);
    if ($nxt eq '.' || $nxt eq ',') {
        return is_digit($prev) ? $digit : " $digit";
    }
    return $digit if $prev eq '.' || $prev eq ',';
    return "$digit ";
}

# Picks the replacement text for a mapped character from the whitespace
# separated fields of its map line. Which field is used depends on the field
# count and on $mode; the field layout looks like
# "symbol english-word(s) tamil-word(s)" -- TODO confirm against the map file.
sub map_replacement {
    my ($mode, @fields) = @_;
    my $count = @fields;
    return ''  if $count < 1 || $count > 5;
    return ' ' if $count == 1;

    if ($mode eq 'tamil') {
        return $fields[1]     if $count == 2;
        return " $fields[2] " if $count == 3;
        return " $fields[3] " if $count == 4;
        return " $fields[3] $fields[4] ";
    }
    if ($mode eq 'english') {
        return $fields[1]     if $count == 2;
        return " $fields[1] " if $count == 3;
        return " $fields[1] $fields[2] ";
    }
    return $count == 2 ? $fields[0] : " $fields[0] ";
}

# Character-level rewriting pass. Returns the rewritten text.
#   $chars        - array ref with the characters of the input
#   $mode         - 'tamil', 'english' or 'none'
#   $is_english   - hash ref, true for characters of lists/english
#   $map_line_for - hash ref built by index_symbol_map()
sub expand_characters {
    my ($chars, $mode, $is_english, $map_line_for) = @_;
    my $last = $#{$chars};
    my $out  = '';
    my $j    = 0;

    while ($j <= $last) {
        my $char = $chars->[$j];
        # The first/last character has no neighbour; never wrap around to the
        # other end of the text.
        my $prev = $j > 0     ? $chars->[$j - 1] : '';
        my $nxt  = $j < $last ? $chars->[$j + 1] : '';

        my $beside_separator = grep { $_ eq '/' || $_ eq '-' || $_ eq '.' }
                                    ($prev, $nxt);

        if ($char eq '$') {
            $out .= $mode eq 'english' ? ' dollar ' : $TAMIL_DOLLAR;
        }
        elsif ($char eq '+') {
            $out .= $mode eq 'english' ? ' plus ' : $TAMIL_PLUS;
        }
        elsif ($IS_BLANKED{$char}
            || $char eq ' ' || $char eq "\t" || $char eq "\n")
        {
            $out .= ' ';
        }
        elsif ($char eq '.') {
            $out .= $char;
        }
        elsif ($char eq ',') {
            # A comma between two digits is dropped.
            $out .= $char unless is_digit($prev) && is_digit($nxt);
        }
        elsif (is_digit($char) && !$beside_separator) {
            $out .= format_digit($char, $prev, $nxt);
        }
        elsif ($is_english->{$char}) {
            $out .= $char;
        }
        elsif (exists $map_line_for->{$char}) {
            my @fields = split /\s+/, $map_line_for->{$char};
            $out .= map_replacement($mode, @fields);
            # The entry mapped to "rupees" also consumes the two characters
            # that follow it.
            $j += 2 if defined $fields[1] && $fields[1] eq 'rupees';
        }
        else {
            $out .= $char;
        }
        $j++;
    }
    return $out;
}

# Splits $text on whitespace and drops the empty leading field that appears
# when the text starts with whitespace.
sub split_tokens {
    my ($text) = @_;
    my @tokens = split /\s+/, $text;
    shift @tokens if @tokens && $tokens[0] eq '';
    return @tokens;
}

# Token-level pass. Each whitespace separated token is emitted followed by a
# blank after these adjustments:
#   - a token starting with a digit loses its commas (a final comma is kept
#     as ", ");
#   - N<sep>N<sep>N with sep in "- /." is split into three tokens when the
#     first two numbers are both above 12, gets its first two numbers swapped
#     when only the second is above 12, and is joined with '/' otherwise;
#   - N-N is emitted with $range_joiner in place of the hyphen;
#   - decimals (N.N) and tokens starting with '.' are left alone;
#   - in any other token every occurrence of its first digit run is
#     surrounded by blanks.
sub normalize_tokens {
    my ($text, $range_joiner) = @_;
    my $out = '';

    for my $token (split_tokens($text)) {
        my $word = $token;
        if ($token =~ /\A[0-9]/) {
            ($word = $token) =~ tr/,//d;
            $word .= ', ' if substr($token, -1) eq ',';
        }

        if ($word =~ m{\A(\d+)[- /.](\d+)[- /.](\d+)\z}) {
            my ($first, $second, $third) = ($1, $2, $3);
            # Numeric comparison: a string comparison would treat "2" as
            # greater than "12".
            if ($first > 12 && $second > 12) {
                $out .= "$first $second $third ";
            }
            elsif ($second > 12) {
                $out .= "$second/$first/$third ";
            }
            else {
                $out .= "$first/$second/$third ";
            }
        }
        elsif ($word =~ m{\A(\d+)-(\d+)\z}) {
            $out .= "$1$range_joiner$2 ";
        }
        elsif ($word =~ m{\A\d+[.]\d+\z} || substr($word, 0, 1) eq '.') {
            $out .= "$word ";
        }
        elsif ($word =~ m{(\d+)}) {
            my $digits = $1;
            $word =~ s/\Q$digits\E/ $digits /g;
            $out .= "$word ";
        }
        else {
            $out .= "$word ";
        }
    }
    return $out;
}

# Runs the post-processing script on $path with the current perl, without
# going through a shell. The child's standard output is read and discarded;
# a failure to start or a non-zero exit is reported as a warning only.
sub run_post_processor {
    my ($path) = @_;
    my $pid = open(my $pipe, '-|', $^X, $POST_PROCESSOR, $path);
    if (!$pid) {
        warn "Cannot run $POST_PROCESSOR: $!\n";
        return;
    }
    my @discarded = <$pipe>;
    close($pipe)
        or warn "$POST_PROCESSOR did not finish cleanly (status $?)\n";
    return;
}