; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; word list (optional if -b is specified, otherwise N/A) (-I filename)
;-I (N/A)
; output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-ro.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application)
-D -0.021046;-0.659149;0.741112;-0.011371;-0.005483;0.125192;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X E
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.046708
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 32593
; Number of lines found in word/lemma list: 428191
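;
; Note on the -D weights above: as the comment says, they decide which of two competing
; candidate rules survives, based on how training pairs move between the success states
; R (right), W (wrong) and N/A (not applicable) when a rule is applied. One plausible
; reading is a weighted sum over those transition counts; the commented Python sketch
; below only illustrates that reading. The transition counts, the function name
; rule_score and the assumption that a lower score wins are invented for the example
; and are not affixtrain's actual code.
;
;   # Minimal sketch, assuming the six -D values weight the transitions
;   # R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A in that order.
;   WEIGHTS = [-0.021046, -0.659149, 0.741112, -0.011371, -0.005483, 0.125192]
;
;   def rule_score(transition_counts):
;       # transition_counts: number of training pairs per transition, same order as WEIGHTS.
;       return sum(w * n for w, n in zip(WEIGHTS, transition_counts))
;
;   # Hypothetical counts for two competing candidate rules:
;   candidate_a = [120, 30, 2, 5, 0, 1]    # turns 30 wrong lemmatizations into right ones
;   candidate_b = [118, 10, 8, 19, 1, 2]
;   winner = min((candidate_a, candidate_b), key=rule_score)   # assumes lower score is better
;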
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
; prun. thrshld.       0            1            2            3            4            5
; rules          50221.000000 27816.000000  9324.000000  5646.000000  4177.000000  3325.000000
; rules%            13.645231     7.557710     2.533365     1.534039     1.134906     0.903415
; same%             91.462255    86.128711    82.825610    81.589630    80.918521    80.406632
; ambi1%             4.242925     5.570469     5.482980     5.413968     5.366963     5.333815
; ambi2%             4.242925     5.105856     4.436106     4.225808     4.058981     3.922586
; ambi3%             0.051895     0.029072     0.009510     0.006249     0.008423     0.007608
; diff%              0.000000     3.165891     7.245794     8.764346     9.647111    10.329359
; same%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi1%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi2%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi3%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; diff%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%         8.537745    11.051004    10.818969    10.670619    10.540473    10.464668
; false_amb%         0.000000     3.839173     4.924901     5.162642     5.247142     5.304471
; false_not_amb%     0.000000     1.325914     2.643677     3.029768     3.244414     3.377549
; true_amb%          8.537745     7.211831     5.894068     5.507977     5.293331     5.160196
; true_not_amb%     91.462255    87.623082    86.537354    86.299613    86.215113    86.157784
; precision          1.000000     0.484335     0.374372     0.347874     0.335284     0.327234
; recall             1.000000     0.844700     0.690354     0.645133     0.619992     0.604398
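;
; The precision and recall rows can be checked against the other rows of the ambiguity
; tables. The tabulated values are consistent with recall = true_amb / (true_amb +
; false_not_amb) and with a precision in which each falsely predicted ambiguity counts
; double; whether that double weighting is intentional in the tool is an assumption here,
; but the arithmetic below reproduces the columns. Sketch in Python, using the
; threshold-1 column of the training-set table above:
;
;   true_amb, false_amb, false_not_amb = 7.211831, 3.839173, 1.325914
;   recall = true_amb / (true_amb + false_not_amb)         # ~0.844700, as tabulated
;   precision = true_amb / (true_amb + 2.0 * false_amb)    # ~0.484335, as tabulated
;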
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.       0            1            2            3            4            5
; rules          49602.666667 27481.500000  9236.000000  5570.833333  4124.833333  3303.000000
; rules%            13.677759     7.577926     2.546794     1.536137     1.137408     0.910791
; same%             78.042377    78.984433    79.617618    79.447739    79.191376    79.101804
; ambi1%             6.557326     5.948851     5.534964     5.337287     5.256980     5.176674
; ambi2%             5.253892     4.985174     4.404497     4.231530     4.070917     4.015320
; ambi3%             0.012355     0.015444     0.015444     0.012355     0.006177     0.006177
; diff%             10.134050    10.066098    10.427477    10.971090    11.474549    11.700025
; same%stdev         1.571077     1.478458     1.192298     1.148966     1.294703     1.422443
; ambi1%stdev        0.274900     0.277091     0.173688     0.205647     0.181220     0.165618
; ambi2%stdev        0.269331     0.302569     0.305061     0.291559     0.304873     0.262022
; ambi3%stdev        0.015102     0.013999     0.014147     0.015401     0.015216     0.015216
; diff%stdev         1.240627     1.249084     1.186992     1.025332     1.089813     1.189899
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%        13.306153    12.438226    11.267606    10.878428    10.683840    10.646775
; false_amb%         1.155177     1.081048     0.904991     0.864838     0.812330     0.809241
; false_not_amb%     0.413887     0.407709     0.416976     0.435508     0.466395     0.491105
; true_amb%          0.886459     0.892637     0.883370     0.864838     0.833951     0.809241
; true_not_amb%     13.652088    13.726217    13.902273    13.942426    13.994935    13.998023
; precision          0.277295     0.292214     0.327982     0.333333     0.339196     0.333333
; recall             0.681710     0.686461     0.679335     0.665083     0.641330     0.622328
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =       0.766*N^0.863 0.469*N^0.857 0.220*N^0.828 0.205*N^0.794 0.194*N^0.775 0.169*N^0.770
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 differently sized samples from
; the total available training data, varying in size from 1.54 % to 98.56 %.
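;
; As a worked example of that power law (first column, unpruned tree): assuming N counts
; word/lemma pairs, the snippet below evaluates the fitted formula for the full list of
; 428191 lines. The result is an estimate only; it lands in the same range as the 50221
; rules reported at pruning threshold 0, not exactly on it.
;
;   def estimated_rules(n_examples, a=0.766, b=0.863):
;       # Fitted power law for pruning threshold 0: #rules = a * N^b
;       return a * n_examples ** b
;
;   print(round(estimated_rules(428191)))   # roughly 55500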