| |
| |
| |
| |
| |
|
|
| package NLP::stringDistance; |
|
|
| use List::Util qw(min max); |
| $utf8 = NLP::UTF8; |
| $util = NLP::utilities; |
| $romanizer = NLP::Romanizer; |
|
|
| %dummy_ht = (); |
|
|
| sub rule_string_expansion { |
| local($this, *ht, $s, $lang_code) = @_; |
| |
| my @characters = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht); |
| foreach $sub_len ((0 .. ($#characters-1))) { |
| my $sub = join("", @characters[0 .. $sub_len]); |
| foreach $super_len ((($sub_len + 1) .. $#characters)) { |
| my $super = join("", @characters[0 .. $super_len]); |
| |
| $ht{RULE_STRING_EXPANSION}->{$lang_code}->{$sub}->{$super} = 1; |
| $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$sub} = 1; |
| |
| } |
| } |
| } |
|
|
| sub load_string_distance_data { |
| local($this, $filename, *ht, $verbose) = @_; |
|
|
| $verbose = 0 unless defined($verbose); |
| open(IN,$filename) || die "Could not open $filename"; |
| my $line_number = 0; |
| my $n_cost_rules = 0; |
| while (<IN>) { |
| $line_number++; |
| my $line = $_; |
| $line =~ s/^\xEF\xBB\xBF//; |
| $line =~ s/\s*$//; |
| next if $line =~ /^\s*(\#.*)?$/; |
| print STDERR "** Warning: line $line_number contains suspicious control character: $line\n" if $line =~ /[\x00-\x1F]/; |
| my $s1 = $util->slot_value_in_double_colon_del_list($line, "s1"); |
| my $s2 = $util->slot_value_in_double_colon_del_list($line, "s2"); |
| $s1 = $util->dequote_string($s1); |
| $s2 = $util->dequote_string($s2); |
| my $cost = $util->slot_value_in_double_colon_del_list($line, "cost"); |
| if (($s1 eq "") && ($s2 eq "")) { |
| print STDERR "Ignoring bad line $line_number in $filename, because both s1 and s2 are empty strings\n"; |
| next; |
| } |
| unless ($cost =~ /^\d+(\.\d+)?$/) { |
| if ($cost eq "") { |
| print STDERR "Ignoring bad line $line_number in $filename, because of missing cost\n"; |
| } else { |
| print STDERR "Ignoring bad line $line_number in $filename, because of ill-formed cost $cost\n"; |
| } |
| next; |
| } |
| my $lang_code1_s = $util->slot_value_in_double_colon_del_list($line, "lc1"); |
| my $lang_code2_s = $util->slot_value_in_double_colon_del_list($line, "lc2"); |
| my @lang_codes_1 = ($lang_code1_s eq "") ? ("") : split(/,\s*/, $lang_code1_s); |
| my @lang_codes_2 = ($lang_code2_s eq "") ? ("") : split(/,\s*/, $lang_code2_s); |
| my $left_context1 = $util->slot_value_in_double_colon_del_list($line, "left1"); |
| my $left_context2 = $util->slot_value_in_double_colon_del_list($line, "left2"); |
| my $right_context1 = $util->slot_value_in_double_colon_del_list($line, "right1"); |
| my $right_context2 = $util->slot_value_in_double_colon_del_list($line, "right2"); |
| my $bad_left = $util->slot_value_in_double_colon_del_list($line, "left"); |
| if ($bad_left) { |
| print STDERR "** Warning: slot '::left $bad_left' in line $line_number\n"; |
| next; |
| } |
| my $bad_right = $util->slot_value_in_double_colon_del_list($line, "right"); |
| if ($bad_right) { |
| print STDERR "** Warning: slot '::right $bad_right' in line $line_number\n"; |
| next; |
| } |
| my $in_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "in-lc1"); |
| my $in_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "in-lc2"); |
| my $out_lang_codes1 = $util->slot_value_in_double_colon_del_list($line, "out-lc1"); |
| my $out_lang_codes2 = $util->slot_value_in_double_colon_del_list($line, "out-lc2"); |
| if ($left_context1) { |
| if ($left_context1 =~ /^\/.*\/$/) { |
| $left_context1 =~ s/^\///; |
| $left_context1 =~ s/\/$//; |
| } else { |
| print STDERR "Ignoring unrecognized non-regular-express ::left1 $left_context1 in $line_number of $filename\n"; |
| $left_context1 = ""; |
| } |
| } |
| if ($left_context2) { |
| if ($left_context2 =~ /^\/.*\/$/) { |
| $left_context2 =~ s/^\///; |
| $left_context2 =~ s/\/$//; |
| } else { |
| $left_context2 = ""; |
| print STDERR "Ignoring unrecognized non-regular-express ::left2 $left_context2 in $line_number of $filename\n"; |
| } |
| } |
| if ($right_context1) { |
| unless ($right_context1 =~ /^(\[[^\[\]]*\])+$/) { |
| $right_context1 = ""; |
| print STDERR "Ignoring unrecognized right-context ::right1 $right_context1 in $line_number of $filename\n"; |
| } |
| } |
| if ($right_context2) { |
| unless ($right_context2 =~ /^(\[[^\[\]]*\])+$/) { |
| $right_context2 = ""; |
| print STDERR "Ignoring unrecognized right-context ::right2 $right_context2 in $line_number of $filename\n"; |
| } |
| } |
| foreach $lang_code1 (@lang_codes_1) { |
| foreach $lang_code2 (@lang_codes_2) { |
| $n_cost_rules++; |
| my $cost_rule_id = $n_cost_rules; |
| $ht{COST}->{$lang_code1}->{$lang_code2}->{$s1}->{$s2}->{$cost_rule_id} = $cost; |
| $ht{RULE_STRING}->{$lang_code1}->{$s1} = 1; |
| $ht{RULE_STRING}->{$lang_code2}->{$s2} = 1; |
| $ht{LEFT1}->{$cost_rule_id} = $left_context1; |
| $ht{LEFT2}->{$cost_rule_id} = $left_context2; |
| $ht{RIGHT1}->{$cost_rule_id} = $right_context1; |
| $ht{RIGHT2}->{$cost_rule_id} = $right_context2; |
| $ht{INLC1}->{$cost_rule_id} = $in_lang_codes1; |
| $ht{INLC2}->{$cost_rule_id} = $in_lang_codes2; |
| $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes1; |
| $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes2; |
| unless (($s1 eq $s2) |
| && ($lang_code1 eq $lang_code2) |
| && ($left_context1 eq $left_context2) |
| && ($right_context1 eq $right_context2) |
| && ($in_lang_codes1 eq $in_lang_codes2) |
| && ($out_lang_codes1 eq $out_lang_codes2)) { |
| $n_cost_rules++; |
| $cost_rule_id = $n_cost_rules; |
| $ht{COST}->{$lang_code2}->{$lang_code1}->{$s2}->{$s1}->{$cost_rule_id} = $cost; |
| $ht{LEFT1}->{$cost_rule_id} = $left_context2; |
| $ht{LEFT2}->{$cost_rule_id} = $left_context1; |
| $ht{RIGHT1}->{$cost_rule_id} = $right_context2; |
| $ht{RIGHT2}->{$cost_rule_id} = $right_context1; |
| $ht{INLC1}->{$cost_rule_id} = $in_lang_codes2; |
| $ht{INLC2}->{$cost_rule_id} = $in_lang_codes1; |
| $ht{OUTLC1}->{$cost_rule_id} = $out_lang_codes2; |
| $ht{OUTLC2}->{$cost_rule_id} = $out_lang_codes1; |
| |
| } |
| $this->rule_string_expansion(*ht, $s1, $lang_code1); |
| $this->rule_string_expansion(*ht, $s2, $lang_code2); |
| } |
| } |
| } |
| close(IN); |
| print STDERR "Read in $n_cost_rules rules from $line_number lines in $filename\n" if $verbose; |
| } |
|
|
| sub romanized_string_to_simple_chart { |
| local($this, $s, *chart_ht) = @_; |
|
|
| my @characters = $utf8->split_into_utf8_characters($s, "return only chars, return trailing whitespaces", *dummy_ht); |
| $chart_ht{N_CHARS} = $#characters + 1; |
| $chart_ht{N_NODES} = 0; |
| foreach $i ((0 .. $#characters)) { |
| $romanizer->add_node($characters[$i], $i, ($i+1), *chart_ht, "", ""); |
| } |
| } |
|
|
| sub linearize_chart_points { |
| local($this, *chart_ht, $chart_id, *sd_ht, $verbose) = @_; |
|
|
| $verbose = 0 unless defined($verbose); |
| print STDERR "Linearize $chart_id\n" if $verbose; |
| my $current_chart_pos = 0; |
| my $current_linear_chart_pos = 0; |
| $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos; |
| $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos; |
| print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose; |
| my @end_chart_positions = keys %{$chart_ht{NODES_ENDING_AT}}; |
| my $end_chart_pos = (@end_chart_positions) ? max(@end_chart_positions) : 0; |
| $sd_ht{MAXPOS}->{$chart_id} = $end_chart_pos; |
| print STDERR " Chart span: $current_chart_pos-$end_chart_pos\n" if $verbose; |
| while ($current_chart_pos < $end_chart_pos) { |
| my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}}; |
| foreach $node_id (@node_ids) { |
| my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id}; |
| my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht); |
| print STDERR " $current_chart_pos/$current_linear_chart_pos node: $node_id $roman_s (@roman_chars)\n" if $verbose; |
| if ($#roman_chars >= 1) { |
| foreach $i ((1 .. $#roman_chars)) { |
| $current_linear_chart_pos++; |
| $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i} = $current_linear_chart_pos; |
| $sd_ht{LINPOS2SPLITPOS}->{$chart_id}->{$current_linear_chart_pos}->{$current_chart_pos}->{$node_id}->{$i} = 1; |
| print STDERR " LINPOS2SPLITPOS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id I: $i\n" if $verbose; |
| } |
| } |
| } |
| $current_chart_pos++; |
| if ($util->member($current_chart_pos, @end_chart_positions)) { |
| $current_linear_chart_pos++; |
| $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} = $current_linear_chart_pos; |
| $sd_ht{LINPOS2POS}->{$chart_id}->{$current_linear_chart_pos} = $current_chart_pos; |
| print STDERR " LINPOS2POS.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos\n" if $verbose; |
| } |
| } |
| $current_chart_pos = 0; |
| while ($current_chart_pos <= $end_chart_pos) { |
| my $current_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}; |
| $current_linear_chart_pos = "?" unless defined($current_linear_chart_pos); |
| my @node_ids = keys %{$chart_ht{NODES_STARTING_AT}->{$current_chart_pos}}; |
| |
| foreach $node_id (@node_ids) { |
| my $end_pos = $chart_ht{NODE_END}->{$node_id}; |
| my $end_linpos = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_pos}; |
| my $roman_s = $chart_ht{NODE_ROMAN}->{$node_id}; |
| my @roman_chars = $utf8->split_into_utf8_characters($roman_s, "return only chars, return trailing whitespaces", *dummy_ht); |
| print STDERR " LINROM.$chart_id LIN: $current_linear_chart_pos POS: $current_chart_pos NODE: $node_id CHARS: @roman_chars\n" if $verbose; |
| if (@roman_chars) { |
| foreach $i ((0 .. $#roman_chars)) { |
| my $from_linear_chart_pos |
| = (($i == 0) |
| ? $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos} |
| : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{$i}); |
| print STDERR " FROM.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $from_linear_chart_pos\n" if $verbose; |
| my $to_linear_chart_pos |
| = (($i == $#roman_chars) |
| ? $end_linpos |
| : $sd_ht{SPLITPOS2LINPOS}->{$chart_id}->{$current_chart_pos}->{$node_id}->{($i+1)}); |
| print STDERR " TO.$chart_id I: $i POS: $current_chart_pos NODE: $node_id FROM: $to_linear_chart_pos\n" if $verbose; |
| my $roman_char = $roman_chars[$i]; |
| $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{$roman_char} = 1; |
| } |
| } else { |
| my $from_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{$current_chart_pos}; |
| my $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+1)}; |
| |
| my $i = 1; |
| while (! (defined($to_linear_chart_pos))) { |
| $i++; |
| $to_linear_chart_pos = $sd_ht{POS2LINPOS}->{$chart_id}->{($current_chart_pos+$i)}; |
| } |
| if (defined($from_linear_chart_pos) && defined($to_linear_chart_pos)) { |
| $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}->{""} = 1 |
| } else { |
| print STDERR " UNDEF.$chart_id from: " |
| . ((defined($from_linear_chart_pos)) ? $from_linear_chart_pos : "?") |
| . " to: " |
| . ((defined($to_linear_chart_pos)) ? $to_linear_chart_pos : "?") |
| . "\n"; |
| } |
| } |
| } |
| $current_chart_pos++; |
| } |
| $sd_ht{MAXLINPOS}->{$chart_id} = $sd_ht{POS2LINPOS}->{$chart_id}->{$end_chart_pos}; |
| } |
|
|
| sub expand_lin_ij_roman { |
| local($this, *sd_ht, $chart_id, $lang_code, *ht) = @_; |
|
|
| foreach $start (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}}) { |
| foreach $end (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}}) { |
| foreach $roman (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$end}}) { |
| if ($ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman} |
| || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman}) { |
| $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht); |
| } |
| } |
| } |
| } |
| } |
|
|
| sub expand_lin_ij_roman_rec { |
| local($this, *sd_ht, $chart_id, $start, $end, $roman, $lang_code, *ht) = @_; |
|
|
| |
| return unless $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$roman} |
| || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman}; |
| foreach $new_end (keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}}) { |
| foreach $next_roman (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$end}->{$new_end}}) { |
| my $exp_roman = join("", $roman, $next_roman); |
| if ($ht{RULE_STRING}->{$lang_code}->{$exp_roman} |
| || $ht{RULE_STRING}->{""}->{$exp_roman}) { |
| $sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start}->{$new_end}->{$exp_roman} = 1; |
| |
| } |
| if ($ht{RULE_STRING_HAS_EXPANSION}->{$lang_code}->{$exp_roman} |
| || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$exp_roman}) { |
| $this->expand_lin_ij_roman_rec(*sd_ht, $chart_id, $start, $new_end, $exp_roman, $lang_code, *ht); |
| } |
| } |
| } |
| } |
|
|
| sub trace_string_distance { |
| local($this, *sd_ht, $chart1_id, $chart2_id, $control, $line_number, $cost) = @_; |
|
|
| my $chart_comb_id = join("/", $chart1_id, $chart2_id); |
| return "mismatch" if $sd_ht{MISMATCH}->{$chart_comb_id}; |
| my $chart1_end = $sd_ht{MAXLINPOS}->{$chart1_id}; |
| my $chart2_end = $sd_ht{MAXLINPOS}->{$chart2_id}; |
| my $verbose = ($control =~ /verbose/); |
| my $chunks_p = ($control =~ /chunks/); |
| my @traces = (); |
| my @s1_s = (); |
| my @s2_s = (); |
| my @e1_s = (); |
| my @e2_s = (); |
| my @r1_s = (); |
| my @r2_s = (); |
| my @ic_s = (); |
|
|
| |
| while ($chart1_end || $chart2_end) { |
| my $incr_cost = $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| my $prec_i = $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| my $prec_j = $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| if ($incr_cost || $verbose || $chunks_p) { |
| my $roman1 = $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| my $roman2 = $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| if ($verbose) { |
| push(@traces, "$prec_i-$chart1_end/$prec_j-$chart2_end:$roman1/$roman2:$incr_cost"); |
| } else { |
| if (defined($roman1)) { |
| push(@traces, "$roman1/$roman2:$incr_cost"); |
| } else { |
| $print_prec_i = (defined($prec_i)) ? $prec_i : "?"; |
| $print_prec_j = (defined($prec_j)) ? $prec_j : "?"; |
| print STDERR " $prec_i-$chart1_end, $prec_j-$chart2_end\n"; |
| } |
| } |
| if ($chunks_p) { |
| push(@s1_s, $prec_i); |
| push(@s2_s, $prec_j); |
| push(@e1_s, $chart1_end); |
| push(@e2_s, $chart2_end); |
| push(@r1_s, $roman1); |
| push(@r2_s, $roman2); |
| push(@ic_s, $incr_cost); |
| } |
| } |
| $chart1_end = $prec_i; |
| $chart2_end = $prec_j; |
| } |
| if ($chunks_p) { |
| my $r1 = ""; |
| my $r2 = ""; |
| my $tc = 0; |
| my $in_chunk = 0; |
| foreach $i ((0 .. $#ic_s)) { |
| if ($ic_s[$i]) { |
| $r1 = $r1_s[$i] . $r1; |
| $r2 = $r2_s[$i] . $r2; |
| $tc += $ic_s[$i]; |
| $in_chunk = 1; |
| } elsif ($in_chunk) { |
| $chunk = "$r1/$r2/$tc"; |
| $chunk .= "*" if $cost > 5; |
| $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1; |
| $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1; |
| $r1 = ""; |
| $r2 = ""; |
| $tc = 0; |
| $in_chunk = 0; |
| } |
| } |
| if ($in_chunk) { |
| $chunk = "$r1/$r2/$tc"; |
| $chunk .= "*" if $cost > 5; |
| $sd_ht{N_COST_CHUNK}->{$chunk} = ($sd_ht{N_COST_CHUNK}->{$chunk} || 0) + 1; |
| $sd_ht{EX_COST_CHUNK}->{$chunk}->{$line_number} = 1; |
| } |
| } else { |
| return join(" ", reverse @traces); |
| } |
| } |
|
|
| sub right_context_match { |
| local($this, $right_context_rule, *sd_ht, $chart_id, $start_pos) = @_; |
| |
| return 1 if $right_context_rule eq ""; |
| if (($right_context_item, $right_context_rest) = ($right_context_rule =~ /^\[([^\[\]]*)\]*(.*)$/)) { |
| my $guarded_right_context_item = $right_context_item; |
| $guarded_right_context_item =~ s/\$/\\\$/g; |
| my @end_positions = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}}; |
| return 1 if ($#end_positions == -1) |
| && (($right_context_item eq "") |
| || ($right_context_item =~ /\$/)); |
| foreach $end_pos (@end_positions) { |
| my @romans = keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$start_pos}->{$end_pos}}; |
| foreach $roman (@romans) { |
| if ($roman =~ /^[$guarded_right_context_item]/) { |
| return $this->right_context_match($right_context_rest, *sd_ht, $chart_id, $end_pos); |
| } |
| } |
| } |
| } |
| return 0; |
| } |
|
|
| sub string_distance { |
| local($this, *sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control) = @_; |
|
|
| my $verbose = ($control =~ /verbose/i); |
| my $chart_comb_id = join("/", $chart1_id, $chart2_id); |
|
|
| my $chart1_end_pos = $sd_ht{MAXLINPOS}->{$chart1_id}; |
| my $chart2_end_pos = $sd_ht{MAXLINPOS}->{$chart2_id}; |
| print STDERR "string_distance.$chart_comb_id $chart1_end_pos/$chart2_end_pos\n" if $verbose; |
| $sd_ht{COST_IJ}->{$chart_comb_id}->{0}->{0} = 0; |
| $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{0}->{0} = ""; |
| $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{0}->{0} = ""; |
| |
| foreach $chart1_start ((0 .. $chart1_end_pos)) { |
| |
| my $prev_further_expansion_possible = 0; |
| my @chart1_ends = sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}}; |
| my $max_chart1_ends = (@chart1_ends) ? $chart1_ends[$#chart1_ends] : -1; |
| foreach $chart1_end (($chart1_start .. $chart1_end_pos)) { |
| my $further_expansion_possible = ($chart1_start == $chart1_end) |
| || defined($sd_ht{LINPOS2SPLITPOS}->{$chart1_id}->{$chart1_start}) |
| || ($chart1_end < $max_chart1_ends); |
| my @romans1 = (($chart1_start == $chart1_end) |
| ? ("") |
| : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart1_id}->{$chart1_start}->{$chart1_end}})); |
| if ($#romans1 == -1) { |
| $further_expansion_possible = 1 if $prev_further_expansion_possible; |
| } else { |
| $prev_further_expansion_possible = 0; |
| } |
| |
| foreach $roman1 (@romans1) { |
| |
| next unless $ht{RULE_STRING}->{$lang_code1}->{$roman1} |
| || $ht{RULE_STRING}->{""}->{$roman1}; |
| |
| foreach $lang_code1o (($lang_code1, "")) { |
| foreach $lang_code2o (($lang_code2, "")) { |
| my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}}); |
| foreach $chart2_start (@chart2_starts) { |
| |
| foreach $chart2_end (($chart2_start .. $chart2_end_pos)) { |
| print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end\n"; |
| my @romans2 = (($chart2_start == $chart2_end) |
| ? ("") |
| : (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart2_id}->{$chart2_start}->{$chart2_end}})); |
| foreach $roman2 (@romans2) { |
| if ($roman1 eq $roman2) { |
| print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2 (IDENTITY)\n"; |
| my $cost = 0; |
| my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start}; |
| my $combined_cost = $preceding_cost + $cost; |
| my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| if ((! defined($old_cost)) || ($combined_cost < $old_cost)) { |
| $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost; |
| push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts); |
| $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start; |
| $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start; |
| $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1; |
| $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2; |
| $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} |
| = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1; |
| $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} |
| = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2; |
| $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost; |
| $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = "IDENTITY"; |
| print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose; |
| } |
| } else { |
| next unless $ht{RULE_STRING}->{$lang_code2o}->{$roman2}; |
| print STDERR " C1 $chart1_start-$chart1_end $roman1 C2 $chart2_start-$chart2_end $roman2\n"; |
| next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}); |
| my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}}; |
| foreach $cost_rule_id (@cost_rule_ids) { |
| |
| |
| my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id}; |
| if ($left_context_rule1) { |
| my $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start}; |
| if (defined($comb_left_roman1)) { |
| next unless $comb_left_roman1 =~ /$left_context_rule1/; |
| } else { |
| print STDERR " No comb_left_roman1 value for $chart_comb_id $chart1_start,$chart2_start\n"; |
| } |
| } |
| my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id}; |
| if ($left_context_rule2) { |
| my $comb_left_roman2 = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start}; |
| if (defined($comb_left_roman2)) { |
| next unless $comb_left_roman2 =~ /$left_context_rule2/; |
| } else { |
| print STDERR " No comb_left_roman2 value for $chart_comb_id $chart1_start,$chart2_start\n"; |
| } |
| } |
| my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id}; |
| if ($right_context_rule1) { |
| my $match_p = $this->right_context_match($right_context_rule1, *sd_ht, $chart1_id, $chart1_end); |
| |
| next unless $match_p; |
| } |
| my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id}; |
| if ($right_context_rule2) { |
| my $match_p = $this->right_context_match($right_context_rule2, *sd_ht, $chart2_id, $chart2_end); |
| |
| next unless $match_p; |
| } |
| my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$roman1}->{$roman2}->{$cost_rule_id}; |
| my $preceding_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_start}->{$chart2_start}; |
| my $combined_cost = $preceding_cost + $cost; |
| my $old_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| if ((! defined($old_cost)) || ($combined_cost < $old_cost)) { |
| $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $combined_cost; |
| push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts); |
| $sd_ht{PREC_I}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart1_start; |
| $sd_ht{PREC_J}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $chart2_start; |
| $sd_ht{ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman1; |
| $sd_ht{ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $roman2; |
| $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} |
| = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman1; |
| $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} |
| = $sd_ht{COMB_LEFT_ROMAN2}->{$chart_comb_id}->{$chart1_start}->{$chart2_start} . $roman2; |
| $comb_left_roman1 = $sd_ht{COMB_LEFT_ROMAN1}->{$chart_comb_id}->{$chart1_end}->{$chart2_end}; |
| |
| $sd_ht{INCR_COST_IJ}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost; |
| $sd_ht{COST_RULE}->{$chart_comb_id}->{$chart1_end}->{$chart2_end} = $cost_rule_id; |
| print STDERR " New cost $chart1_end/$chart2_end: $combined_cost (+$cost from $chart1_start/$chart2_start $roman1/$roman2)\n" if $verbose; |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| $further_expansion_possible = 1 |
| if $ht{RULE_STRING_HAS_EXPANSION}->{$lang_code1}->{$roman1} |
| || $ht{RULE_STRING_HAS_EXPANSION}->{""}->{$roman1}; |
| |
| } |
| |
| last unless $further_expansion_possible; |
| $prev_further_expansion_possible = 1 if $further_expansion_possible; |
| } |
| } |
| my $total_cost = $sd_ht{COST_IJ}->{$chart_comb_id}->{$chart1_end_pos}->{$chart2_end_pos}; |
| unless (defined($total_cost)) { |
| $total_cost = 99.9999; |
| $sd_ht{MISMATCH}->{$chart_comb_id} = 1; |
| } |
| return $total_cost; |
| } |
|
|
| sub print_sd_ht { |
| local($this, *sd_ht, $chart1_id, $chart2_id, *OUT) = @_; |
|
|
| print OUT "string-distance chart:\n"; |
| foreach $chart_id (($chart1_id, $chart2_id)) { |
| print OUT "SD chart $chart_id:\n"; |
| foreach $from_linear_chart_pos (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}}) { |
| foreach $to_linear_chart_pos (sort { $a <=> $b } keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}}) { |
| foreach $roman_char (sort keys %{$sd_ht{LIN_IJ_ROMAN}->{$chart_id}->{$from_linear_chart_pos}->{$to_linear_chart_pos}}) { |
| print OUT " Lnode($from_linear_chart_pos-$to_linear_chart_pos): $roman_char\n"; |
| } |
| } |
| } |
| } |
| } |
|
|
| sub print_chart_ht { |
| local($this, *chart_ht, *OUT) = @_; |
|
|
| print OUT "uroman chart:\n"; |
| foreach $start (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AT}}) { |
| foreach $end (sort { $a <=> $b } keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}}) { |
| foreach $node_id (keys %{$chart_ht{NODES_STARTING_AND_ENDING_AT}->{$start}->{$end}}) { |
| $roman_s = $chart_ht{NODE_ROMAN}->{$node_id}; |
| print OUT " Node $node_id ($start-$end): $roman_s\n"; |
| } |
| } |
| } |
| } |
|
|
| sub normalize_string { |
| local($this, $s) = @_; |
|
|
| |
| $s =~ s/(\xE2\x80[\x93-\x94])/-/g; |
| $s =~ s/([\x00-\x7F\xC0-\xFE][\x80-\xBF]*)\1+/$1$1/g; |
| $s =~ s/[ \t]+/ /g; |
|
|
| return $s; |
| } |
|
|
| my $string_distance_chart_id = 0; |
| sub string_distance_by_chart { |
| local($this, $s1, $s2, $lang_code1, $lang_code2, *ht, *pinyin_ht, $control) = @_; |
|
|
| $control = "" unless defined($control); |
| %sd_ht = (); |
|
|
| $s1 = $this->normalize_string($s1); |
| my $lc_s1 = $utf8->extended_lower_case($s1); |
| $string_distance_chart_id++; |
| my $chart1_id = $string_distance_chart_id; |
| *chart_ht = $romanizer->romanize($lc_s1, $lang_code1, "", *ht, *pinyin_ht, 0, "return chart", $chart1_id); |
| $this->linearize_chart_points(*chart_ht, $chart1_id, *sd_ht); |
| $this->expand_lin_ij_roman(*sd_ht, $chart1_id, $lang_code1, *ht); |
|
|
| $s2 = $this->normalize_string($s2); |
| my $lc_s2 = $utf8->extended_lower_case($s2); |
| $string_distance_chart_id++; |
| my $chart2_id = $string_distance_chart_id; |
| *chart_ht = $romanizer->romanize($lc_s2, $lang_code2, "", *ht, *pinyin_ht, 0, "return chart", $chart2_id); |
| $this->linearize_chart_points(*chart_ht, $chart2_id, *sd_ht); |
| $this->expand_lin_ij_roman(*sd_ht, $chart2_id, $lang_code2, *ht); |
|
|
| my $cost = $this->string_distance(*sd_ht, $chart1_id, $chart2_id, $lang_code1, $lang_code2, *ht, $control); |
| return $cost; |
| } |
|
|
| my $n_quick_romanized_string_distance = 0; |
| sub quick_romanized_string_distance_by_chart { |
| local($this, $s1, $s2, *ht, $control, $lang_code1, $lang_code2) = @_; |
|
|
| |
| |
| $s1 = lc $s1; |
| $s2 = lc $s2; |
| $control = "" unless defined($control); |
| $lang_code1 = "" unless defined($lang_code1); |
| $lang_code2 = "" unless defined($lang_code2); |
| my $cache_p = ($control =~ /cache/); |
| my $total_cost; |
| if ($cache_p) { |
| $total_cost = $ht{CACHED_QRSD}->{$s1}->{$s2}; |
| if (defined($total_cost)) { |
| return $total_cost; |
| } |
| } |
| my @lang_codes1 = ($lang_code1 eq "") ? ("") : ($lang_code1, ""); |
| my @lang_codes2 = ($lang_code2 eq "") ? ("") : ($lang_code2, ""); |
| my $chart1_end_pos = length($s1); |
| my $chart2_end_pos = length($s2); |
| my %sd_ht = (); |
| $sd_ht{COST_IJ}->{0}->{0} = 0; |
| foreach $chart1_start ((0 .. $chart1_end_pos)) { |
| foreach $chart1_end (($chart1_start .. $chart1_end_pos)) { |
| my $substr1 = substr($s1, $chart1_start, ($chart1_end-$chart1_start)); |
| foreach $lang_code1o (@lang_codes1) { |
| foreach $lang_code2o (@lang_codes2) { |
| |
| } |
| } |
| my @chart2_starts = (sort { $a <=> $b } keys %{$sd_ht{COST_IJ}->{$chart1_start}}); |
| foreach $chart2_start (@chart2_starts) { |
| foreach $chart2_end (($chart2_start .. $chart2_end_pos)) { |
| my $substr2 = substr($s2, $chart2_start, ($chart2_end-$chart2_start)); |
| foreach $lang_code1o (@lang_codes1) { |
| foreach $lang_code2o (@lang_codes2) { |
| if ($substr1 eq $substr2) { |
| my $cost = 0; |
| my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start}; |
| if (defined($preceding_cost)) { |
| my $combined_cost = $preceding_cost + $cost; |
| my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end}; |
| if ((! defined($old_cost)) || ($combined_cost < $old_cost)) { |
| $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost; |
| push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts); |
| } |
| } |
| } else { |
| next unless defined($ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}); |
| my @cost_rule_ids = keys %{$ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}}; |
| my $best_cost = 99.99; |
| foreach $cost_rule_id (@cost_rule_ids) { |
| my $cost = $ht{COST}->{$lang_code1o}->{$lang_code2o}->{$substr1}->{$substr2}->{$cost_rule_id}; |
| my $left_context_rule1 = $ht{LEFT1}->{$cost_rule_id}; |
| next if $left_context_rule1 |
| && (! (substr($s1, 0, $chart1_start) =~ /$left_context_rule1/)); |
| my $left_context_rule2 = $ht{LEFT2}->{$cost_rule_id}; |
| next if $left_context_rule2 |
| && (! (substr($s2, 0, $chart2_start) =~ /$left_context_rule2/)); |
| my $right_context_rule1 = $ht{RIGHT1}->{$cost_rule_id}; |
| my $right_context1 = substr($s1, $chart1_end); |
| next if $right_context_rule1 |
| && (! (($right_context1 =~ /^$right_context_rule1/) |
| || (($right_context_rule1 =~ /^\[[^\[\]]*\$/) |
| && ($right_context1 eq "")))); |
| my $right_context_rule2 = $ht{RIGHT2}->{$cost_rule_id}; |
| my $right_context2 = substr($s2, $chart2_end); |
| next if $right_context_rule2 |
| && (! (($right_context2 =~ /^$right_context_rule2/) |
| || (($right_context_rule2 =~ /^\[[^\[\]]*\$/) |
| && ($right_context2 eq "")))); |
| $best_cost = $cost if $cost < $best_cost; |
| my $preceding_cost = $sd_ht{COST_IJ}->{$chart1_start}->{$chart2_start}; |
| my $combined_cost = $preceding_cost + $cost; |
| my $old_cost = $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end}; |
| if ((! defined($old_cost)) || ($combined_cost < $old_cost)) { |
| $sd_ht{COST_IJ}->{$chart1_end}->{$chart2_end} = $combined_cost; |
| push(@chart2_starts, $chart2_end) unless $util->member($chart2_end, @chart2_starts); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| $total_cost = $sd_ht{COST_IJ}->{$chart1_end_pos}->{$chart2_end_pos}; |
| $total_cost = 99.99 unless defined($total_cost); |
| $ht{CACHED_QRSD}->{$s1}->{$s2} = $total_cost if $cache_p; |
| $n_quick_romanized_string_distance++; |
| return $total_cost; |
| } |
|
|
| sub get_n_quick_romanized_string_distance { |
| return $n_quick_romanized_string_distance; |
| } |
|
|
| 1; |
|
|
|
|