| 52 | | Usage: |
| | 53 | === Sample output === |
| | 54 | {{{ |
| | 55 | <doc source="https://en.wikipedia.org/wiki/Dog" lang="english" |
| | 56 | lang_scores="english: 49.56, czech: 19.86, slovak: 20.15"> |
| | 57 | <par_langs lang="english" lang_scores="english: 49.56, czech: 19.86, slovak: 20.15"/> |
| | 58 | <p> |
| | 59 | #wordform English Czech Slovak score for each word |
| | 60 | Linnaeus 0.00 0.00 0.00 #unknown to all sample wordlists |
| | 61 | considered 5.18 0.00 0.00 #English only |
| | 62 | the 7.82 5.26 5.33 #English word, ~100 x more frequent in the English wl |
| | 63 | dog 4.89 0.00 0.00 |
| | 64 | to 7.48 7.05 7.15 #a valid word in all three languages |
| | 65 | be 6.77 0.00 0.00 |
| | 66 | a 7.37 7.56 7.66 #a valid word in all three languages |
| | 67 | separate 4.91 0.00 0.00 |
| | 68 | species 5.14 0.00 0.00 |
| | 69 | <g/> |
| | 70 | . 0.00 0.00 0.00 #punctuation is omitted from wordlists |
| | 71 | </p> |
| | 72 | </doc> |
| | 73 | }}} |
| | 74 | |
| | 75 | |
| | 76 | == Installation == |
| | 77 | {{{ |
| | 78 | wget http://corpus.tools/raw-attachment/wiki/Downloads/wcwb_lang_filter_1.0.tar.gz |
| | 79 | tar -xzvf wcwb_lang_filter_1.0.tar.gz |
| | 80 | cd wcwb_lang_filter_1.0 |
| | 81 | make test/out.vert.lang_czech |
| | 82 | }}} |
| | 83 | |
| | 84 | == Usage == |
| | 128 | == To build your own frequency wordlist == |
| | 129 | {{{ |
| | 130 | #Get corpus frequencies of lowercased words from a corpus compiled by [https://nlp.fi.muni.cz/trac/noske Sketch Engine] |
| | 131 | lsclex -f /corpora/registry/english_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > en.wl1 |
| | 132 | lsclex -f /corpora/registry/czech_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > cs.wl1 |
| | 133 | lsclex -f /corpora/registry/slovak_web_corpus lc | cut -f2,3 | ./uninorm_4.py | perl -pe 's, (\d+)$,\t$1,' > sk.wl1 |
| | 134 | |
| | 135 | #Or get the same from a vertical file |
| | 136 | cut -f1 english_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > en.wl1 |
| | 137 | cut -f1 czech_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > cs.wl1 |
| | 138 | cut -f1 slovak_web_corpus.vert | grep -v '^<' | tr '[:upper:]' '[:lower:]' | sort | uniq -c | perl -pe 's,^\s*(\d+) (.*)$,$2\t$1,' > sk.wl1 |
| | 139 | |
| | 140 | #Filter the wordlist -- allow just characters valid for the language and a reasonable word length |
| | 141 | grep '[abcdefghijklmnopqrstuvwxyz]' en.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[abcdefghijklmnopqrstuvwxyzéè0-9'][abcdefghijklmnopqrstuvwxyzéè0-9'.-]{0,29}" > en.wl2 |
| | 142 | grep '[aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž]' cs.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž0-9'][aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzž0-9'.-]{0,29}" > cs.wl2 |
| | 143 | grep '[aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž]' sk.wl1 | grep -v -P "['.-]{2}" | ./wl_grep.py "[#@]?[aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž0-9'][aáäbcčdďeéfghiíjklĺľmnňoóôpqrŕsštťuúvwxyýzž0-9'.-]{0,29}" > sk.wl2 |
| | 144 | |
| | 145 | #Sort (not necessary) and pack |
| | 146 | for f in {en,cs,sk}.wl2; do sort -k2,2rg -k1,1 $f | gzip > ${f}.frqwl.gz; done |
| | 147 | }}} |
| | 148 | |