; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-bg.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application)
-D 0.013192;-0.681737;0.722663;-0.108477;0.015373;0.028548;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.362332
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with same fraction of training data when fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with same fraction of training data when fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 20070
; Number of lines found in word/lemma list: 55198
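;
; The six -D weights above are applied to the six transition counts named in the
; comment (R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A) when competing candidate rules
; are compared. As an illustration only (this is a sketch, not affixtrain's actual
; comparison code, and the function name is hypothetical), a linear penalty over
; those counts could look like the Python below; under that reading a transition
; with a negative weight lowers the penalty, and presumably the candidate with the
; lowest penalty survives:
;
;   # Illustrative sketch only, not affixtrain source code.
;   WEIGHTS = (0.013192, -0.681737, 0.722663, -0.108477, 0.015373, 0.028548)
;
;   def rule_penalty(counts):
;       """counts = numbers of R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A cases."""
;       return sum(w * c for w, c in zip(WEIGHTS, counts))
;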
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
; prun. thrshld.               0             1             2             3             4             5
; rules              9407.000000   5070.000000   1635.000000    955.000000    679.000000    539.000000
; rules%               20.900729     11.264664      3.632688      2.121845      1.508621      1.197565
; same%                82.460896     75.879844     71.853893     70.098649     68.987736     68.447831
; ambi1%                8.429613      8.698454      7.378688      6.821010      6.445521      6.074476
; ambi2%                8.429613      7.865268      5.961162      5.490135      5.287949      4.983558
; ambi3%                0.679879      0.435478      0.173303      0.146641      0.151084      0.075542
; diff%                 0.000000      7.120956     14.632954     17.443566     19.127711     20.418592
; same%stdev            0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi1%stdev           0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi2%stdev           0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi3%stdev           0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; diff%stdev            0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%           17.539104     18.034572     15.541681     14.724049     14.175258     13.366513
; false_amb%            0.000000      3.703786      4.732492      5.059101      5.050213      4.703608
; false_not_amb%        0.000000      3.208319      6.729915      7.874156      8.414060      8.876200
; true_amb%            17.539104     14.330786     10.809189      9.664948      9.125044      8.662904
; true_not_amb%        82.460896     78.757110     77.728404     77.401795     77.410683     77.757288
; precision             1.000000      0.659240      0.533151      0.488544      0.474633      0.479405
; recall                1.000000      0.817076      0.616291      0.551051      0.520269      0.493919
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.               0             1             2             3             4             5
; rules              9279.647059   4994.941176   1613.000000    944.294118    672.000000    535.411765
; rules%               20.920533     11.260863      3.636434      2.128867      1.514993      1.207061
; same%                66.576357     67.091123     66.838255     66.260273     66.206087     66.034498
; ambi1%                7.387339      7.071254      6.755170      6.511334      6.186219      5.897227
; ambi2%                5.815949      5.969475      5.824980      5.265059      4.930913      4.804479
; ambi3%                0.243836      0.216743      0.180620      0.162558      0.099341      0.081279
; diff%                19.976519     19.651404     20.400975     21.800777     22.577441     23.182516
; same%stdev            3.077565      2.858220      2.875605      2.613944      2.788026      2.743223
; ambi1%stdev           1.021196      1.018685      1.080764      0.923700      0.872987      0.743980
; ambi2%stdev           0.985847      0.846035      1.043048      0.883411      0.763830      0.792024
; ambi3%stdev           0.215120      0.191849      0.118378      0.126253      0.109416      0.079884
; diff%stdev            3.112734      2.710415      2.891265      2.868935      3.022665      3.085296
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%           16.328005     15.966766     15.172040     14.377314     13.600650     13.094916
; false_amb%            0.487673      0.424456      0.397363      0.352208      0.334146      0.307053
; false_not_amb%        0.623137      0.659261      0.677323      0.686354      0.650230      0.659261
; true_amb%             0.632168      0.596044      0.577982      0.568952      0.605075      0.596044
; true_not_amb%         4.389054      4.452271      4.479364      4.524519      4.542581      4.569674
; precision             0.393258      0.412500      0.421053      0.446809      0.475177      0.492537
; recall                0.503597      0.474820      0.460432      0.453237      0.482014      0.474820
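;
; The precision and recall rows can be reproduced from the confusion percentages
; above. The reported figures are consistent with recall = true_amb / (true_amb +
; false_not_amb) and with a precision that counts false positives twice; the latter
; is an observation about these numbers, not a statement about affixtrain's source.
; A small Python reading aid (hypothetical code, values taken from the training-set
; table at pruning threshold 1):
;
;   true_amb, false_amb, false_not_amb = 14.330786, 3.703786, 3.208319
;
;   recall = true_amb / (true_amb + false_not_amb)      # ~0.817076, as reported
;   precision = true_amb / (true_amb + 2 * false_amb)   # ~0.659240, as reported
;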
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; ------------------------------------------------------------------------------------------------------------
; #rules =          1.072*N^0.846  0.635*N^0.837  0.249*N^0.818  0.174*N^0.804  0.124*N^0.808  0.095*N^0.814
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by
; a power law. See the last line in the table above, which is based on 7
; different samples from the total available training data mass, varying in size
; from 1.54 % to 98.56 %.
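;
; The power-law fits in the last table can be used to predict how many rules a
; larger or smaller training set would produce. A small Python sketch using the
; coefficients reported above for pruning threshold 0 (the value of N in the
; example is arbitrary and not taken from this run):
;
;   def estimated_rules(n_examples, a=1.072, b=0.846):
;       # #rules ~ a * N^b, the fit reported for pruning threshold 0
;       return a * n_examples ** b
;
;   print(round(estimated_rules(100000)))   # about 18200 rules
;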