; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i spanish.txt.learn.flat.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application)
-D -0.0005687963;-0.6946223877;0.7166924688;-0.0549098036;-0.0003540120;0.0289155389;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.308231
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with same fraction of training data when fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with same fraction of training data when fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 19614
; Number of lines found in word/lemma list: 162216
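
The six -D penalties in the parameter block above decide which of two competing candidate rules survives during training. As a rough illustration only (this is not the affixtrain source code, and the sign convention is an assumption: a lower total penalty is taken to be better), the values can be read as weights on the counts of success-state transitions a rule causes, in the order R=>R; W=>R; R=>W; W=>W; R=>N/A; W=>N/A:

    # Hypothetical sketch: score a candidate rule by weighting its transition counts
    # with the -D penalties (order: R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A).
    D_WEIGHTS = [-0.0005687963, -0.6946223877, 0.7166924688,
                 -0.0549098036, -0.0003540120, 0.0289155389]

    def penalty(counts, weights=D_WEIGHTS):
        """counts: transitions caused by one candidate rule, in the -D order."""
        return sum(c * w for c, w in zip(counts, weights))

    # A candidate that corrects 12 previously wrong lemmas while breaking 3 correct ones
    # scores lower (better, under the assumed convention) than one that does the opposite.
    print(penalty([200, 12, 3, 0, 0, 0]) < penalty([200, 3, 12, 0, 0, 0]))   # True
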
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
; prun. thrshld.             0             1             2             3             4             5
; rules           18456.000000  10401.000000   3478.000000   1947.000000   1367.000000   1080.000000
; rules%             11.483111      6.471382      2.163972      1.211401      0.850532      0.671964
; same%              98.955968     94.845168     91.480995     90.183732     89.378620     88.847894
; ambi1%              0.518283      0.711784      0.498995      0.332249      0.280607      0.307983
; ambi2%              0.518283      0.638988      0.413755      0.276874      0.233943      0.230210
; ambi3%              0.007466      0.001244      0.000000      0.000000      0.000000      0.000000
; diff%               0.000000      3.802816      7.606254      9.207145     10.106830     10.613913
; same%stdev          0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi1%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi2%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi3%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; diff%stdev          0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
;
;Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
;---------------------------------------------------------------------------------------
; amb.rules%          1.044032      1.432278      1.098163      0.740404      0.636499      0.708673
; false_amb%          0.000000      0.813823      0.851776      0.630899      0.573036      0.643965
; false_not_amb%      0.000000      0.425577      0.797646      0.934527      0.980569      0.979325
; true_amb%           1.044032      0.618455      0.246387      0.109505      0.063463      0.064708
; true_not_amb%      98.955968     98.142145     98.104192     98.325069     98.382932     98.312003
; precision           1.000000      0.275346      0.126356      0.079855      0.052469      0.047838
; recall              1.000000      0.592372      0.235995      0.104887      0.060787      0.061979
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.             0             1             2             3             4             5
; rules           18198.800000  10254.400000   3445.600000   1911.500000   1355.200000   1083.300000
; rules%             11.493945      6.476444      2.176162      1.207260      0.855913      0.684187
; same%              88.883308     89.439980     89.444165     88.979575     88.824711     88.640549
; ambi1%              0.560857      0.414365      0.263687      0.184162      0.167420      0.121380
; ambi2%              0.405994      0.397623      0.284614      0.171606      0.154864      0.133936
; ambi3%              0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; diff%              10.149841      9.748033     10.007534     10.664658     10.853005     11.104135
; same%stdev          2.308541      2.167692      2.075796      2.128960      2.102096      2.101237
; ambi1%stdev         0.146085      0.122366      0.157993      0.116361      0.080168      0.101563
; ambi2%stdev         0.171742      0.223117      0.205069      0.132883      0.078511      0.091052
; ambi3%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; diff%stdev          2.342525      2.226816      2.084668      2.218198      2.137881      2.158838
;
;Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
;---------------------------------------------------------------------------------------
; amb.rules%          1.180311      0.979407      0.719906      0.472962      0.405994      0.389252
; false_amb%          0.121380      0.087896      0.096267      0.054412      0.058597      0.075339
; false_not_amb%      0.066968      0.075339      0.058597      0.075339      0.075339      0.075339
; true_amb%           0.008371      0.000000      0.016742      0.000000      0.000000      0.000000
; true_not_amb%       8.927675      8.961159      8.952788      8.994643      8.990457      8.973715
; precision           0.033333      0.000000      0.080000      0.000000      0.000000      0.000000
; recall              0.111111      0.000000      0.222222      0.000000      0.000000      0.000000
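
The precision and recall rows in the two ambiguity-prediction tables are consistent with the following relations, inferred from the numbers in this report rather than taken from the tool's documentation: recall = true_amb / (true_amb + false_not_amb), and precision = true_amb / (true_amb + 2*false_amb), i.e. each falsely predicted ambiguity appears to be counted twice. A quick check in Python against the training-data column for pruning threshold 1:

    # Reproduce the precision and recall entries for pruning threshold 1 (training data).
    true_amb      = 0.618455   # true_amb%
    false_amb     = 0.813823   # false_amb%
    false_not_amb = 0.425577   # false_not_amb%

    recall    = true_amb / (true_amb + false_not_amb)   # 0.592372, as in the table
    precision = true_amb / (true_amb + 2 * false_amb)   # 0.275346, as in the table
    print(round(recall, 6), round(precision, 6))
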
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
;----------------------------------------------------------------------------------------------------------
; #rules =     0.765*N^0.840  0.496*N^0.828  0.263*N^0.786  0.279*N^0.731  0.256*N^0.711  0.271*N^0.688
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 different samples from the total
; available training data mass, varying in size from 1.54% to 98.56%.
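
As a quick sanity check of the fitted power law, reading N as the number of word/lemma pairs in the training list (an assumption; the report does not state the meaning of N explicitly), the pruning-threshold-0 fit comes close to the rule counts reported above:

    # Evaluate the threshold-0 fit, #rules = 0.765*N^0.840, at the size of this word/lemma list.
    n_pairs = 162216
    estimate = 0.765 * n_pairs ** 0.840
    print(round(estimate))   # ~18200, close to the 18456 rules built from the full training set
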