; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-hu.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers:
; R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases,
; N/A=#not applicable cases, previous success state=>success state after rule application)
-D -0.032724;-0.547237;0.728254;0.212133;-0.015476;0.351947;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X E
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.312510
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with same fraction of training data when fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with same fraction of training data when fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 23196
; Number of lines found in word/lemma list: 63998
;
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there should be no errors (diff% = 0).
; prun. thrshld.              0            1            2            3            4            5
; rules             9096.000000  5171.000000  2009.000000  1328.000000  1037.000000   858.000000
; rules%              15.892651     9.034839     3.510151     2.320299     1.811860     1.499109
; same%               78.956564    75.027082    72.273474    71.165741    70.222246    69.479680
; ambi1%              10.245658    10.289339     9.354579     8.832163     8.596289     8.405843
; ambi2%              10.245658     9.550267     8.149002     7.553203     7.294615     7.069225
; ambi3%               0.552119     0.468253     0.246357     0.227138     0.179963     0.188699
; diff%                0.000000     4.665059     9.976587    12.221756    13.706888    14.856554
; same%stdev           0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi1%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi2%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi3%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; diff%stdev           0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%          21.043436    20.859978    18.925813    17.891463    17.449418    17.091239
; false_amb%           0.000000     2.075689     2.440857     2.309816     2.341266     2.402418
; false_not_amb%       0.000000     2.259147     4.558479     5.461788     5.935283     6.354614
; true_amb%           21.043436    18.784289    16.484956    15.581647    15.108152    14.688821
; true_not_amb%       78.956564    76.880875    76.515707    76.646748    76.615299    76.554146
; precision            1.000000     0.818999     0.771527     0.771320     0.763397     0.753518
; recall               1.000000     0.892644     0.783378     0.740452     0.717951     0.698024
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.              0            1            2            3            4            5
; rules             8975.500000  5115.687500  1987.000000  1316.312500  1022.812500   848.062500
; rules%              15.921347     9.074551     3.524674     2.334964     1.814334     1.504350
; same%               68.815407    68.917151    69.077035    68.626453    67.928779    67.332849
; ambi1%              10.174419     9.767442     9.178779     8.713663     8.655523     8.670058
; ambi2%               8.808140     8.902616     8.481105     7.783430     7.572674     7.492733
; ambi3%               0.392442     0.363372     0.276163     0.290698     0.268895     0.210756
; diff%               11.809593    12.049419    12.986919    14.585756    15.574128    16.293605
; same%stdev           1.916934     1.906608     1.712240     1.632885     1.726357     1.852520
; ambi1%stdev          0.815005     0.928406     0.910816     0.853867     0.871010     0.952168
; ambi2%stdev          0.758494     0.713681     0.711699     0.768210     0.850091     0.868232
; ambi3%stdev          0.233217     0.263640     0.181107     0.192543     0.180380     0.154635
; diff%stdev           1.851063     1.854773     2.064747     2.135364     2.348592     2.513910
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%          20.675872    20.399709    19.171512    18.059593    17.681686    17.725291
; false_amb%           0.174419     0.196221     0.174419     0.138081     0.138081     0.123547
; false_not_amb%       0.377907     0.392442     0.377907     0.392442     0.436047     0.421512
; true_amb%            0.966570     0.952035     0.966570     0.952035     0.908430     0.922965
; true_not_amb%        4.876453     4.854651     4.876453     4.912791     4.912791     4.927326
; precision            0.734807     0.708108     0.734807     0.775148     0.766871     0.788820
; recall               0.718919     0.708108     0.718919     0.708108     0.675676     0.686486
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =          1.766*N^0.779  1.104*N^0.771  0.400*N^0.778  0.243*N^0.787  0.148*N^0.813  0.097*N^0.837
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by
; a power law.
; See the last line in the table above, which is based on 7
; different samples from the total available training data mass, varying in size
; from 1.54 % to 98.56 %.
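;
; Worked example (illustrative note, not part of the generated report): taking N
; to be the 63998 word/lemma lines reported above, the unpruned column of the
; power law predicts #rules = 1.766 * 63998^0.779, i.e. about 9800 rules, which
; is of the same order as the 9096 rules actually induced from the full training
; set at pruning threshold 0.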
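;
; Reading note (illustrative observation about the reported figures, not a
; statement of the trainer's internal definitions): the precision and recall rows
; in the "prediction of ambiguity" blocks above are consistent with
; recall = true_amb% / (true_amb% + false_not_amb%) and
; precision = true_amb% / (true_amb% + 2*false_amb%), i.e. a falsely predicted
; ambiguity appears to be weighted twice as heavily as a missed one.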