; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-mk.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers:
; R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases,
; N/A=#not applicable cases, previous success state=>success state after rule application)
-D 0.0057774324;-0.6732268078;0.7362387698;0.0273332853;0.0059725934;0.0624658307;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better
; -XW: fewer pattern characters other than wildcards is better)
-X S
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.037777
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with same fraction of training data when fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with same fraction of training data when fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
;
; Number of clusters found in word/lemma list: 73995
; Number of lines found in word/lemma list: 1323572
;
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
;
; prun. thrshld.                  0              1              2              3              4              5
; rules                 107080.000000   59077.000000   19424.000000   11029.000000    8014.000000    6419.000000
; rules%                     8.521963       4.701644       1.545859       0.877743       0.637794       0.510856
; same%                     96.877243      94.357502      92.636476      91.941301      91.594549      91.280666
; ambi1%                     1.533126       1.442399       1.005000       0.831186       0.709421       0.657691
; ambi2%                     1.533126       1.309571       0.808822       0.649493       0.541735       0.492790
; ambi3%                     0.056505       0.025228       0.003900       0.001910       0.001990       0.002149
; diff%                      0.000000       2.865299       5.545802       6.576110       7.152305       7.566704
; same%stdev                 0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
; ambi1%stdev                0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
; ambi2%stdev                0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
; ambi3%stdev                0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
; diff%stdev                 0.000000       0.000000       0.000000       0.000000       0.000000       0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%                 3.122757       2.878829       1.972515       1.627593       1.389634       1.281478
; false_amb%                 0.000000       0.831345       0.912601       0.857847       0.774442       0.744757
; false_not_amb%             0.000000       1.075273       2.062844       2.353010       2.507565       2.586035
; true_amb%                  3.122757       2.047484       1.059913       0.769746       0.615192       0.536721
; true_not_amb%             96.877243      96.045898      95.964642      96.019396      96.102802      96.132487
; precision                  1.000000       0.551857       0.367373       0.309702       0.284275       0.264886
; recall                     1.000000       0.655665       0.339416       0.246496       0.197003       0.171874
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
;
; prun. thrshld.                  0              1              2              3              4              5
; rules                 105097.666667   57832.333333   19121.000000   10943.666667    7951.333333    6395.666667
; rules%                     8.484210       4.668625       1.543579       0.883448       0.641887       0.516302
; same%                     89.581966      89.876409      90.221489      90.390278      90.257122      90.213987
; ambi1%                     1.365316       1.290298       1.119634       0.864575       0.761426       0.680782
; ambi2%                     0.887080       0.937717       0.750173       0.615142       0.564506       0.421973
; ambi3%                     0.001875       0.011253       0.000000       0.003751       0.001875       0.003751
; diff%                      8.163763       7.884323       7.908704       8.126254       8.415071       8.679507
; same%stdev                 0.801935       0.541525       0.566555       0.483612       0.559456       0.700296
; ambi1%stdev                0.058864       0.072778       0.124276       0.070634       0.099749       0.154899
; ambi2%stdev                0.117521       0.068490       0.046859       0.032305       0.048497       0.090107
; ambi3%stdev                0.003377       0.014809       0.000000       0.003311       0.003377       0.003311
; diff%stdev                 0.638506       0.528775       0.435125       0.445401       0.468405       0.467456
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ---------------------------------------------------------------------------------------
; amb.rules%                 2.539337       2.464320       2.057351       1.663510       1.479717       1.260291
; false_amb%                 0.598263       0.600139       0.506367       0.431350       0.345080       0.279440
; false_not_amb%             0.849571       0.812063       0.885205       0.924589       0.958347       0.973350
; true_amb%                  0.210049       0.247557       0.174415       0.135031       0.101273       0.086270
; true_not_amb%             32.872602      32.870726      32.964498      33.039515      33.125785      33.191426
; precision                  0.149333       0.170984       0.146919       0.135338       0.127962       0.133721
; recall                     0.198230       0.233628       0.164602       0.127434       0.095575       0.081416
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =               0.768*N^0.843  0.498*N^0.831  0.406*N^0.760  0.538*N^0.697  0.769*N^0.646  0.963*N^0.614
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 different samples
; from the total available training data mass, varying in size from 1.54 % to 98.56 %.
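;
; The Python sketch below is not part of the trainer's output; it only illustrates
; how the fitted power laws above can be applied. The factors and exponents are
; copied from the "#rules =" line, N is the number of word/lemma training examples,
; and the function and variable names are illustrative only.

# Estimate the number of flex rules from the number of training examples,
# using the power laws fitted by the trainer (one per pruning threshold).
POWER_LAWS = {  # pruning threshold -> (factor, exponent), copied from the report above
    0: (0.768, 0.843),
    1: (0.498, 0.831),
    2: (0.406, 0.760),
    3: (0.538, 0.697),
    4: (0.769, 0.646),
    5: (0.963, 0.614),
}

def estimated_rules(n_examples: int, pruning_threshold: int = 0) -> float:
    """Return the estimated rule count for a training set of n_examples word/lemma pairs."""
    factor, exponent = POWER_LAWS[pruning_threshold]
    return factor * n_examples ** exponent

if __name__ == "__main__":
    # Roughly the size of the word/lemma list used in this run (~1.3 million lines).
    # The power laws are approximate fits, so the estimates will not reproduce
    # the rule counts in the tables above exactly.
    for threshold in sorted(POWER_LAWS):
        print(threshold, round(estimated_rules(1_300_000, threshold)))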