大佬教程收集整理的这篇文章主要介绍了不想做诗人的程序员不是一个好爸爸,大佬教程大佬觉得挺不错的,现在分享给大家,也给大家做个参考。
#!/usr/bin/perl use Strict; #use warnings; use encoding 'utf8'; use Data::Dumper; sub dump_chars(); sub dump_sentenses(); sub dump_index(); sub recount_chars; sub mark_chars; sub dump_result; sub count_unsed_chars; sub count_weight; # 读取字频表 my %chars; open(CHAR_FILE,'<','lex-chars.lex'); while (my $line = <CHAR_FILE>) { chomp($linE); my @items = split(/\//,$linE); print "bad line: $line\n" if (scalar(@items) != 3); $chars{$items[0]} = $items[2]; } close(CHAR_FILE); # 排序,按字频输出 my @sorted_chars = sort { $chars{$b} <=> $chars{$a} } keys %chars; #dump_chars(); # 筛选出频率前1000个字 my %top_chars; for (my $i = 0; $i < 2000; $i++) { $top_chars{$sorted_chars[$i]} = 1; } # 读取名句语料库 my %sentenses; my %char_sentenses; my %first_chars; my %last_chars; while (my $file = <r01*>) { # print "reading $file\n"; open(FILE,$filE); while (my $line = <FILE>) { chomp($linE); $line =~ s/^\s+//g; $line =~ s/\s+$//g; next if (!$linE); $line =~ s/。$//; $line =~ s/!$//; $line =~ s/?$//; $line =~ s/,$//; next if ($line =~ /^—/); next if (length($linE) < 10); next if (length($linE) > 16); next if ($line =~ /^(/); next if ($line =~ /《/); next if ($line =~ /[0-9A-Za-z]/); next if ($line =~ /^,/); next if ($line =~ /-/); $sentenses{$linE}++; my $first_char = substr($line,1); my $last_char = substr($line,length($linE) - 1,1); $first_chars{$first_char} = 1; $last_chars{$last_char} = 1; } close(FILE); } # 排序,按句频输出 my @sorted_sentenses = sort { $sentenses{$b} <=> $sentenses{$a} } keys %sentenses; #dump_sentenses(); # 过滤只出现过一次的句子 #while ($sentenses{$sorted_sentenses[$#sorted_sentenses]} < 2) { # pop(@sorted_sentenses); #} print '@R_869_10586@l sentenses: ' . scalar(@sorted_sentenses) . "\n"; # 过滤末字无法接的句子 foreach my $sentense (keys(%sentenses)) { my $last_char = substr($sentense,length($sentensE) - 1,1); if (!$first_chars{$last_char}) { #print "delete $sentense\n"; delete($sentenses{$sentensE}); } } print '@R_869_10586@l sentenses: ' . scalar(%sentenses) . "\n"; # 过滤首字无法接的句子 foreach my $sentense (keys(%sentenses)) { my $first_char = substr($sentense,1); if (!$last_chars{$first_char}) { #print "delete $sentense\n"; delete($sentenses{$sentensE}); } } print '@R_869_10586@l sentenses: ' . scalar(%sentenses) . "\n"; foreach my $sentense (keys(%sentenses)) { # 建立首字索引 my $first_char = substr($sentense,1); if (!$char_sentenses{$first_char}) { $char_sentenses{$first_char} = []; } push($char_sentenses{$first_char},$sentensE); } # 输出索引信息 #dump_index(); # 统计top字出现在各句子的次数 my %char_in_sentense; foreach my $char (keys(%top_chars)) { my $count = 0; foreach my $sentense (@sorted_sentenses) { $count++ if (index($sentense,$char) >= 0); } $char_in_sentense{$char} = $count; } my @sorted_char_in_sentenses = sort { $char_in_sentense{$a} <=> $char_in_sentense{$b} } keys %char_in_sentense; #my $i = 0; #foreach my $c (@sorted_char_in_sentenses) { # $i++; # print "$i:\t$c\t " . $char_in_sentense{$c} . "\n"; #} # 以任意一个句子开头,接龙,首字不重复,直至不能接或首尾相连 my %used_first_chars; my %used_sentenses; my %used_chars; my $char_count = 0; my @sentense_queue; my $max_char_count = 0; my $max_depth = 0; my $max_loop_depth = 0; my $lastlog = time(); $| = 1; # disable IO buffer sub traverse { my ($sentense,$depth,$progress) = @_; # 每分钟显示最新进度 if (time() - $lastlog > 60) { print "sentense: $sentense,depth: $depth,count: $char_count,@R_869_10586@l progress: $progress\n"; $lastlog = time(); } return if ($used_sentenses{$sentensE}); $depth++; $used_sentenses{$sentensE} = 1; # 标记句子已使用 mark_chars($sentensE); push(@sentense_queue,$sentensE); if ($depth != scalar(@sentense_queuE)) { print "$sentense: $depth\n"; dump_result(); exit; } if ($depth > $max_depth) { $max_depth = $depth; print "new depth: $depth\n"; dump_result(); } # 找到覆盖1000个字的方案(实际上100句职能覆盖500多个字) if ($char_count >= 1000) { if ($char_count >= 1000) { print "found one: $depth\n"; dump_result(); exit; } } if ($depth >= 100) { if ($char_count > $max_char_count) { $max_char_count = $char_count; print "new record: $max_char_count\n"; dump_result(); } #print "too deep: $char_count\n"; pop(@sentense_queuE); return; } my $last_char = substr($sentense,1); #print $last_char; # 找到闭环 if ($last_char eq substr($sentense_queue[0],1)) { if ($depth > $max_loop_depth) { $max_loop_depth = $depth; print "new loop: $depth\n"; dump_result(); } pop(@sentense_queuE); return; } if ($char_sentenses{$last_char}) { my %child_sentenses; foreach my $s (@{$char_sentenses{$last_char}}) { if (!$used_sentenses{$s}) { my $count = count_weight($s); if ($count > 0) { $child_sentenses{$s} = $count; } } } my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses; my $i = 0; my $len = scalar(@sorted_children); foreach my $s (@sorted_children) { $i++; my $first_char = substr($s,1); if (!$used_first_chars{$first_char}) { $used_first_chars{$first_char} = 1; traverse($s,$progress * ($len - $i + 1)); delete($used_sentenses{$s}); delete($used_first_chars{$first_char}); recount_chars(); } } pop(@sentense_queuE); } else { pop(@sentense_queuE); #print "dead at $sentense: $depth; last char:$last_char\n"; } } # 挑选第一个句子 my %child_sentenses; foreach my $s (keys(%sentenses)) { my $count = count_weight($s); if ($count > 0) { $child_sentenses{$s} = $count; } } my @sorted_children = sort { $child_sentenses{$b} <=> $child_sentenses{$a} } keys %child_sentenses; my $i = 0; my $len = scalar(@sorted_children); foreach my $s (@sorted_children) { $i++; my $first_char = substr($s,1); $used_first_chars{$first_char} = 1; traverse($s,$len - $i + 1); delete($used_sentenses{$s}); delete($used_first_chars{$first_char}); recount_chars(); print "depth: 0,@R_869_10586@l progress: " . ($len - $i + 1) . ",progress: $i/$len,weight: " . $child_sentenses{$s} . "\n"; } sub dump_chars() { my $i = 0; foreach my $char (@sorted_chars) { $i++; print "$i\t $char\t $chars{$char}\n"; } } sub dump_sentenses() { my $i = 0; foreach my $sentense (@sorted_sentenses) { $i++; print "$i\t $sentense\t $sentenses{$sentensE}\n"; } } sub dump_index() { foreach my $char (keys(%char_sentenses)) { print "$char: " . scalar(@{$char_sentenses{$char}}) . "\n"; } } sub mark_chars { my $sentense = shift; for (my $i = 0,my $len = length($sentensE); $i < $len; $i++) { my $char = substr($sentense,$i,1); if ($top_chars{$char} && !$used_chars{$char}) { $char_count++; } $used_chars{$char} = 1; } } sub count_weight { my $sentense = shift; my $weight = 0; for (my $i = 0,1); if ($top_chars{$char} && !$used_chars{$char}) { if ($char_in_sentense{$char}) { $weight += 1 / $char_in_sentense{$char}; } } } return $weight; } sub count_unsed_chars { my $sentense = shift; my $count = 0; for (my $i = 0,1); if ($top_chars{$char} && !$used_chars{$char}) { $count++; } } return $count; } sub recount_chars { # 清空计算器 undef %used_chars; $char_count = 0; foreach my $sentense (keys(%used_sentenses)) { mark_chars($sentensE); } } sub dump_result { print "chars: " . keys(%used_chars) . "\n"; print "char_count: $char_count\n"; my $i = 0; foreach my $sentense (@sentense_queuE) { $i++; print "$i: $sentense\n"; } }
以上是大佬教程为你收集整理的不想做诗人的程序员不是一个好爸爸全部内容,希望文章能够帮你解决不想做诗人的程序员不是一个好爸爸所遇到的程序开发问题。
如果觉得大佬教程网站内容还不错,欢迎将大佬教程推荐给程序员好友。
本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。