; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-uk.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application)
-D 0.008000;-0.238492;0.876932;0.278726;0.008168;0.310311;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.066602
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 13617
; Number of lines found in word/lemma list: 300292
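;
; The option block above is mechanically readable: lines starting with ';' are
; comments, every other line is a flag, optionally followed by a value. Below is a
; minimal Python sketch of such a reader, assuming only the layout shown above;
; the function name and the usage comment at the end are illustrative, not part of
; any tool.

def read_parms(path):
    """Collect '-X value' option lines from a parameter file; ';' lines are comments."""
    options = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith(";"):
                continue                      # comment or blank line
            flag, _, value = line.partition(" ")
            options[flag] = value or None     # e.g. {'-i': 'wfl-uk.txt.ph', '-p': None}
    return options

# Hypothetical usage: read_parms("parms.txt").get("-D")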
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there should be no errors (diff%).
; prun. thrshld.  0  1  2  3  4  5
; rules           32925.000000  17525.000000  5563.000000  3386.000000  2558.000000  2099.000000
; rules%          15.759847  8.388499  2.662780  1.620739  1.224410  1.004705
; same%           96.664226  90.665671  86.463045  84.765242  83.872543  83.118655
; ambi1%          1.627441  1.839008  1.180852  0.992739  0.823293  0.800318
; ambi2%          1.627441  1.664297  0.978379  0.774949  0.661507  0.616034
; ambi3%          0.080893  0.047866  0.011488  0.011966  0.011009  0.011009
; diff%           0.000000  5.783158  11.366236  13.455104  14.631648  15.453984
; same%stdev      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
; ambi1%stdev     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
; ambi2%stdev     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
; ambi3%stdev     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
; diff%stdev      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%      3.335774  3.806775  2.577100  2.184121  1.846666  1.776782
; false_amb%      0.000000  1.666212  1.628398  1.493895  1.323492  1.283763
; false_not_amb%  0.000000  1.195211  2.387072  2.645548  2.812600  2.842756
; true_amb%       3.335774  2.140563  0.948702  0.690226  0.523174  0.493019
; true_not_amb%   96.664226  94.998014  95.035828  95.170331  95.340733  95.380462
; precision       1.000000  0.391114  0.225586  0.187663  0.165031  0.161089
; recall          1.000000  0.641699  0.284402  0.206916  0.156837  0.147797
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.  0  1  2  3  4  5
; rules           32508.428571  17310.000000  5496.571429  3359.000000  2526.428571  2072.000000
; rules%          15.796631  8.411347  2.670917  1.632219  1.227653  1.006835
; same%           82.391951  82.867597  83.128287  82.423965  81.788246  81.152527
; ambi1%          1.820261  1.637320  1.157100  0.992454  0.905557  0.882689
; ambi2%          1.152527  1.257718  0.905557  0.896410  0.800366  0.740910
; ambi3%          0.000000  0.000000  0.013721  0.022868  0.027441  0.022868
; diff%           14.635262  14.237366  14.795335  15.664304  16.478390  17.201006
; same%stdev      2.582385  2.424247  2.503199  2.565969  2.981989  3.166524
; ambi1%stdev     0.429031  0.437603  0.387519  0.381125  0.360757  0.361939
; ambi2%stdev     0.186043  0.227897  0.258141  0.254203  0.263215  0.220396
; ambi3%stdev     0.000000  0.000000  0.017233  0.015459  0.020653  0.023874
; diff%stdev      2.167391  2.029887  2.323520  2.366969  2.756033  2.956127
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%      3.791448  3.503316  2.680082  2.446833  2.204436  2.112966
; false_amb%      0.310999  0.297279  0.214955  0.210382  0.150926  0.118912
; false_not_amb%  0.160073  0.169220  0.196661  0.196661  0.196661  0.196661
; true_amb%       0.091470  0.082323  0.054882  0.054882  0.054882  0.054882
; true_not_amb%   13.487308  13.501029  13.583352  13.587926  13.647382  13.679396
; precision       0.128205  0.121622  0.113208  0.115385  0.153846  0.187500
; recall          0.363636  0.327273  0.218182  0.218182  0.218182  0.218182
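;
; The precision and recall rows above can be reproduced from the ambiguity counts
; in the same tables. The reported figures are consistent with recall computed as
; true_amb/(true_amb+false_not_amb) and with a precision variant that weights each
; false positive twice, true_amb/(true_amb+2*false_amb); this is an observation
; from the numbers themselves, not a statement about the trainer's internals.
; A short Python check against the pruning-threshold-1 column of the training-set
; table follows.

# Values taken from the pruning-threshold-1 column of the training-set ambiguity table.
true_amb, false_amb, false_not_amb = 2.140563, 1.666212, 1.195211

recall = true_amb / (true_amb + false_not_amb)        # 0.641699, as reported
precision = true_amb / (true_amb + 2.0 * false_amb)   # 0.391114, matches the reported value

assert abs(recall - 0.641699) < 1e-5
assert abs(precision - 0.391114) < 1e-5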
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; prun. thrshld.  0  1  2  3  4  5
; #rules =        1.410*N^0.820  0.869*N^0.808  0.365*N^0.784  0.274*N^0.767  0.234*N^0.756  0.189*N^0.757
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a
; power law. See the fitted expressions in the table directly above; they are based
; on 7 differently sized samples of the available training data, ranging from
; 1.54 % to 98.56 % of the total.
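;
; The fitted power law can be used to extrapolate the rule count for other training
; set sizes. A small Python sketch, assuming N stands for the number of word/lemma
; training pairs and using the pruning-threshold-0 coefficients from the table above:

def estimated_rules(n_pairs, factor=1.410, exponent=0.820):
    """Estimate the rule count via the fitted power law #rules = factor * N^exponent."""
    return factor * n_pairs ** exponent

# For a hypothetical training set of 200 000 pairs this prints roughly 31 000,
# i.e. on the same order as the pruning-threshold-0 rule counts reported above.
print(round(estimated_rules(200_000)))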