; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-et.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application; an illustrative scoring sketch follows the option list)
-D 0.012820;-0.706711;0.698674;-0.034104;0.011938;0.104611;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.148046
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with same fraction of training data when fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with same fraction of training data when fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 36379
; Number of lines found in word/lemma list: 135093
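;
; The -D penalty vector above weights how many training pairs change success
; state when a candidate rule is applied. Below is a minimal sketch in Python
; (illustrative only, not affixtrain's code) of one plausible way such weights
; could rank competing candidate rules. The transition order follows the -D
; comment; the plain weighted sum and the lower-total-survives reading are
; assumptions suggested by the sign pattern of the weights.
;
;   # Illustrative only, not affixtrain's implementation.
;   # The six -D values, in the order given in the comment above:
;   # R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A
;   # (previous success state => success state after applying the candidate rule).
;   D_PENALTIES = (0.012820, -0.706711, 0.698674, -0.034104, 0.011938, 0.104611)
;
;   def rule_penalty(transition_counts, penalties=D_PENALTIES):
;       """Weighted sum of transition counts; fixes (W=>R) lower the total,
;       breakages (R=>W) raise it, so a lower total is taken to be better."""
;       return sum(p * c for p, c in zip(penalties, transition_counts))
;
;   # Hypothetical candidates: counts of training pairs per transition.
;   candidate_a = (120, 40, 3, 10, 0, 0)   # fixes 40 pairs, breaks 3
;   candidate_b = (120, 25, 1, 27, 0, 0)   # fixes 25 pairs, breaks 1
;   survivor = min((candidate_a, candidate_b), key=rule_penalty)
;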
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there should be no errors (diff%).
; prun. thrshld.              0            1            2            3            4            5
; rules            20219.000000 11395.000000  4126.000000  2562.000000  1897.000000  1555.000000
; rules%              19.612008    11.052912     4.002134     2.485087     1.840050     1.508318
; same%               76.350938    71.162520    67.375721    65.712207    64.681119    63.670401
; ambi1%              10.550463    10.289539     8.986857     8.369950     7.913090     7.732674
; ambi2%              10.550463     9.515495     7.612396     6.955720     6.558999     6.435812
; ambi3%               2.548135     2.267811     1.870120     1.798341     1.762452     1.740143
; diff%                0.000000     6.764635    14.154906    17.163781    19.084340    20.420971
; same%stdev           0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi1%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi2%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi3%stdev          0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; diff%stdev           0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%          23.649062    23.008875    20.430671    19.144478    18.312236    18.078471
; false_amb%           0.000000     2.498666     3.150492     3.096173     3.018575     3.119453
; false_not_amb%       0.000000     3.138853     6.368883     7.600757     8.355400     8.690043
; true_amb%           23.649062    20.510209    17.280178    16.048305    15.293661    14.959018
; true_not_amb%       76.350938    73.852272    73.200446    73.254765    73.332363    73.231486
; precision            1.000000     0.804084     0.732796     0.721575     0.716975     0.705683
; recall               1.000000     0.867274     0.730692     0.678602     0.646692     0.632542
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.              0            1            2            3            4            5
; rules            19974.000000 11247.272727  4073.818182  2530.000000  1872.545455  1535.181818
; rules%              19.653153    11.066605     4.008380     2.489360     1.842466     1.510522
; same%               64.045503    64.101448    63.753341    63.075775    62.317399    61.776590
; ambi1%               8.597004     8.174302     7.807546     7.453223     7.229440     6.887549
; ambi2%               7.241872     7.074035     6.701063     6.352956     6.147821     6.042146
; ambi3%               1.659725     1.672158     1.672158     1.684590     1.641077     1.647293
; diff%               18.455896    18.978057    20.065892    21.433456    22.664263    23.646423
; same%stdev           3.705945     3.755631     3.760779     3.516667     3.518386     3.449162
; ambi1%stdev          0.840443     0.853944     0.665207     0.678232     0.773039     0.745810
; ambi2%stdev          0.613192     0.712302     0.626308     0.641392     0.543363     0.555550
; ambi3%stdev          0.424388     0.394330     0.376490     0.373959     0.296105     0.336754
; diff%stdev           2.957394     3.105253     3.189499     3.144173     3.170078     3.048049
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%          20.382918    19.680487    18.499409    17.678871    17.330764    16.945360
; false_amb%           0.609188     0.540809     0.466215     0.410269     0.447566     0.453783
; false_not_amb%       1.255672     1.292969     1.417293     1.516753     1.609996     1.597563
; true_amb%            2.026481     1.989184     1.864860     1.765401     1.672158     1.684590
; true_not_amb%        6.850252     6.918630     6.993224     7.049170     7.011873     7.005657
; precision            0.624521     0.647773     0.666667     0.682692     0.651332     0.649880
; recall               0.617424     0.606061     0.568182     0.537879     0.509470     0.513258
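;
; The precision and recall rows in the two ambiguity tables above can be
; reproduced from the true_amb%, false_amb% and false_not_amb% rows. Recall
; matches the usual true_amb / (true_amb + false_not_amb); the printed
; precision values agree with true_amb / (true_amb + 2*false_amb), i.e. with
; false alarms weighted twice, rather than with the standard
; true_amb / (true_amb + false_amb). A small Python check (not part of
; affixtrain), using the pruning-threshold-1 column of the first table:
;
;   # Values copied from the threshold-1 column of the training-set ambiguity table.
;   true_amb, false_amb, false_not_amb = 20.510209, 2.498666, 3.138853
;
;   recall = true_amb / (true_amb + false_not_amb)           # 0.867274, as printed
;   precision = true_amb / (true_amb + 2.0 * false_amb)      # 0.804084, as printed
;   standard_precision = true_amb / (true_amb + false_amb)   # ~0.891, for comparison
;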
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; ------------------------------------------------------------------------------------------------------------
; #rules =         1.821*N^0.807  0.981*N^0.811  0.234*N^0.846  0.175*N^0.828  0.149*N^0.815  0.134*N^0.805
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by
; a power law. See the last line in the table above, which is based on 7
; different samples from the total available training data, varying in size
; from 1.54 % to 98.56 %.
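;
; The entries in the power-law line have the form a*N^b, one per column. Such a
; fit can be obtained by ordinary least squares on the logarithms of the sample
; sizes and of the observed rule counts. A short Python sketch of that
; procedure (the sample points below are invented placeholders, not the 7
; samples actually used for this report):
;
;   import math
;
;   # Hypothetical (training examples, rules) measurements; placeholders only.
;   samples = [(2000, 850), (10000, 3200), (50000, 11500), (135000, 25000)]
;
;   # Fit rules ~= a * N**b via least squares on log(rules) = log(a) + b*log(N).
;   xs = [math.log(n) for n, _ in samples]
;   ys = [math.log(r) for _, r in samples]
;   mean_x = sum(xs) / len(xs)
;   mean_y = sum(ys) / len(ys)
;   num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
;   den = sum((x - mean_x) ** 2 for x in xs)
;   b = num / den
;   a = math.exp(mean_y - b * mean_x)
;
;   # Estimate for e.g. N = 135093 (the number of lines in the word/lemma list).
;   estimate = a * 135093 ** b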