; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (optional if -b is specified, otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i morph-it_048_utf8.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers:
; R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases,
; N/A=#not applicable cases, previous success state=>success state after rule application)
-D 0.0006500449;-0.7124017300;0.6974840179;-0.0751288207;0.0006206730;0.0188327467;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better
; -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.121008
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for the next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flex rules using the full training set (-F: yes -F-: no)
-F
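;
; Note (comment only, not an option): a minimal Python sketch of how the six -D penalties
; above could be used to decide which of two competing candidate rules survives. It assumes
; the penalties are combined as a simple weighted sum over the six counts of state
; transitions (R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A) observed when a candidate rule is
; applied to the training pairs, and that a higher sum is better; the trainer's actual
; combination may differ.
;
;   D_WEIGHTS = [0.0006500449, -0.7124017300, 0.6974840179,
;                -0.0751288207, 0.0006206730, 0.0188327467]
;
;   def rule_score(transition_counts):
;       # transition_counts: counts of R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A
;       return sum(w * c for w, c in zip(D_WEIGHTS, transition_counts))
;
;   def surviving_rule(candidate_a, candidate_b):
;       # candidate_*: the six transition counts produced by each candidate rule
;       return 'a' if rule_score(candidate_a) >= rule_score(candidate_b) else 'b'
;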
; Number of clusters found in word/lemma list: 25154
; Number of lines found in word/lemma list: 413196
;
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
; prun. thrshld.        0            1            2            3            4            5
; rules          42595.000000 23858.000000  7958.000000  4609.000000  3385.000000  2668.000000
; rules%            12.465066     6.981842     2.328841     1.348785     0.990592     0.780768
; same%             94.523799    89.828073    87.312819    86.536734    86.002956    85.611401
; ambi1%             2.657478     4.324949     4.111321     3.698989     3.529842     3.415712
; ambi2%             2.657185     3.445561     2.589000     2.165255     2.028006     1.875540
; ambi3%             0.161538     0.152759     0.082525     0.067015     0.057358     0.054431
; diff%              0.000000     2.248658     5.904335     7.532008     8.381839     9.042916
; same%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi1%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi2%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi3%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; diff%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%         5.475908     8.228202     7.580586     6.873272     6.634476     6.440455
; false_amb%         0.000000     3.795268     5.008852     5.051578     5.118300     5.096645
; false_not_amb%     0.000000     1.042974     2.904175     3.654215     3.959733     4.132098
; true_amb%          5.475908     4.432934     2.571734     1.821694     1.516176     1.343810
; true_not_amb%     94.524092    90.728824    89.515239    89.472514    89.405791    89.427447
; precision          1.000000     0.368690     0.204277     0.152764     0.129006     0.116477
; recall             1.000000     0.809534     0.469645     0.332674     0.276881     0.245404
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.        0            1            2            3            4            5
; rules          40748.166667 22815.000000  7729.500000  4463.500000  3262.833333  2577.333333
; rules%            12.100271     6.774972     2.295295     1.325448     0.968907     0.765346
; same%             85.126827    85.966739    86.171678    86.091047    86.158240    85.876029
; ambi1%             4.065177     3.551151     3.228624     3.087519     2.959852     2.801949
; ambi2%             2.203931     1.925080     1.847808     1.720141     1.498404     1.505123
; ambi3%             0.036956     0.033597     0.026877     0.016798     0.026877     0.026877
; diff%              8.567109     8.523434     8.725013     9.084495     9.356627     9.790022
; same%stdev         1.584252     1.642895     1.615913     1.974112     1.939858     1.860347
; ambi1%stdev        1.098250     1.268623     1.295578     1.631147     1.732428     1.501260
; ambi2%stdev        0.712405     0.658786     0.701844     0.775722     0.746973     0.788450
; ambi3%stdev        0.025199     0.023550     0.038385     0.026506     0.028050     0.020535
; diff%stdev         0.999256     0.995579     1.087194     1.019289     1.136911     1.100546
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%         7.579372     6.658827     6.094406     5.724845     5.358643     5.197379
; false_amb%         1.115404     0.960860     0.920544     0.883588     0.900386     0.876869
; false_not_amb%     0.651772     0.675290     0.745842     0.769360     0.776079     0.796237
; true_amb%          0.299009     0.275491     0.204939     0.181421     0.174702     0.154544
; true_not_amb%     15.272972    15.427516    15.467831    15.504788    15.487989    15.511507
; precision          0.118194     0.125382     0.100164     0.093103     0.088435     0.080986
; recall             0.314488     0.289753     0.215548     0.190813     0.183746     0.162544
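;
; Note (comment only): a short Python sketch recomputing the ambiguity-prediction scores
; above from the raw percentages. The recall values match the usual definition
; true_amb / (true_amb + false_not_amb); the reported precision values are consistent with
; false_amb being counted twice, i.e. true_amb / (true_amb + 2*false_amb). That doubling is
; inferred from the reported numbers, not from documentation.
;
;   def ambiguity_scores(true_amb, false_amb, false_not_amb):
;       # recall as usually defined; precision with false_amb weighted twice
;       # (the weighting reproduces the values reported above but is an inference)
;       recall = true_amb / (true_amb + false_not_amb)
;       precision = true_amb / (true_amb + 2 * false_amb)
;       return precision, recall
;
;   # training set, pruning threshold 1: reproduces precision 0.368690, recall 0.809534
;   print(ambiguity_scores(4.432934, 3.795268, 1.042974))
;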
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =       0.728*N^0.859  0.529*N^0.839  0.262*N^0.806  0.265*N^0.761  0.311*N^0.722  0.304*N^0.707
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 different samples from the total
; available training data mass, varying in size from 1.54 % to 98.56 %.
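;
; Note (comment only): evaluating the fitted power laws above in Python. The factors and
; exponents are copied from the #rules line, one pair per pruning threshold; N is the number
; of word/lemma training examples one plans to use.
;
;   POWER_LAWS = {          # pruning threshold -> (factor, exponent)
;       0: (0.728, 0.859),
;       1: (0.529, 0.839),
;       2: (0.262, 0.806),
;       3: (0.265, 0.761),
;       4: (0.311, 0.722),
;       5: (0.304, 0.707),
;   }
;
;   def estimated_rules(n_examples, pruning_threshold=0):
;       factor, exponent = POWER_LAWS[pruning_threshold]
;       return factor * n_examples ** exponent
;
;   # usage: estimated_rules(n) for the intended number of training pairs n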