; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (optional if -b is specified, otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i frlexambi.uden.dubletter.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers: R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases, N/A=#not applicable cases, previous success state=>success state after rule application); a scoring sketch follows the parameter list below
-D -0.001161;-0.739382;0.661059;-0.127638;-0.000914;-0.004777;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.073391
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flex rules using full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 45086
; Number of lines found in word/lemma list: 272512
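;
; The -D weights above steer which of two competing candidate rules survives.
; Below is a minimal sketch in Python of one plausible way to use them: as a
; linear score over the counts of training pairs that change correctness state
; (R=right, W=wrong, N/A=not applicable) when a rule is applied. The function
; name and the plain weighted-sum combination are illustrative assumptions,
; not taken from the training program's sources.
;
;   def rule_score(transition_counts, weights):
;       # transition_counts and weights are ordered as in the -D option:
;       # R=>R, W=>R, R=>W, W=>W, R=>N/A, W=>N/A
;       return sum(c * w for c, w in zip(transition_counts, weights))
;
;   # Weights copied from the -D line above.
;   D = [-0.001161, -0.739382, 0.661059, -0.127638, -0.000914, -0.004777]
;   # Hypothetical transition counts for one candidate rule.
;   print(rule_score([120, 15, 3, 40, 2, 1], D))
;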
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
;
; prun. thrshld.            0            1            2            3            4            5
; rules          32620.000000 17663.000000  6397.000000  3537.000000  2603.000000  2109.000000
; rules%            11.970115     6.481549     2.347420     1.297924     0.955187     0.773911
; same%             91.447716    89.059197    87.637609    86.998004    86.815993    86.656000
; ambi1%             4.217062     3.682774     2.731623     2.095321     1.695705     1.415717
; ambi2%             4.217062     3.313616     2.119173     1.574243     1.254991     1.048027
; ambi3%             0.118160     0.056144     0.024953     0.021650     0.017614     0.011743
; diff%              0.000000     3.888269     7.486643     9.310783    10.215697    10.868512
; same%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi1%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi2%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; ambi3%stdev        0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
; diff%stdev         0.000000     0.000000     0.000000     0.000000     0.000000     0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%         8.552284     7.255093     5.241604     4.074683     3.333798     2.842444
; false_amb%         0.000000     1.620112     2.019361     1.848726     1.661945     1.523970
; false_not_amb%     0.000000     2.917303     5.330041     6.326327     6.880431     7.233810
; true_amb%          8.552284     5.634981     3.222243     2.225957     1.671853     1.318474
; true_not_amb%     91.447716    89.827604    89.428355    89.598990    89.785771    89.923746
; precision          1.000000     0.634913     0.443776     0.375790     0.334656     0.301958
; recall             1.000000     0.658886     0.376770     0.260276     0.195486     0.154166
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
;
; prun. thrshld.            0            1            2            3            4            5
; rules          32237.375000 17457.500000  6299.250000  3484.875000  2564.000000  2080.750000
; rules%            12.003152     6.500065     2.345441     1.297546     0.954671     0.774739
; same%             85.064440    85.673925    86.210399    86.537363    86.699257    86.680211
; ambi1%             3.364866     2.933147     2.333185     1.860199     1.533236     1.333249
; ambi2%             2.085582     1.904641     1.472922     1.104692     0.872960     0.834868
; ambi3%             0.031744     0.025395     0.022221     0.000000     0.000000     0.000000
; diff%              9.453368     9.462891     9.961272    10.497746    10.894546    11.151673
; same%stdev         0.975230     0.834743     1.073108     1.062731     0.965157     1.060367
; ambi1%stdev        0.490460     0.281130     0.288914     0.305493     0.226345     0.246269
; ambi2%stdev        0.343064     0.309194     0.261062     0.128350     0.135261     0.177305
; ambi3%stdev        0.036668     0.039571     0.020861     0.000000     0.000000     0.000000
; diff%stdev         0.807046     0.923027     1.293536     1.262250     1.122537     1.162635
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%         6.040886     5.348867     4.247349     3.352168     2.749032     2.501428
; false_amb%         0.415847     0.358707     0.269824     0.212685     0.190464     0.174592
; false_not_amb%     0.930100     0.930100     0.961844     0.999937     1.041204     1.041204
; true_amb%          0.288871     0.288871     0.257127     0.219034     0.177766     0.177766
; true_not_amb%     11.653228    11.710368    11.799251    11.856390    11.878611    11.894483
; precision          0.257790     0.287066     0.322709     0.339901     0.318182     0.337349
; recall             0.236979     0.236979     0.210938     0.179688     0.145833     0.145833
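;
; The precision and recall rows in the two ambiguity tables can be reproduced
; from the true_amb%, false_amb% and false_not_amb% rows. The recall figures
; match the usual tp/(tp+fn); the precision figures are consistent with a
; variant that counts false alarms twice, tp/(tp+2*fp). The Python sketch
; below is illustrative; the double weighting of false positives is inferred
; from the reported numbers, not from documentation.
;
;   def ambiguity_scores(true_amb, false_amb, false_not_amb):
;       # recall: standard tp/(tp+fn); precision: tp/(tp+2*fp), see note above
;       recall = true_amb / (true_amb + false_not_amb)
;       precision = true_amb / (true_amb + 2 * false_amb)
;       return precision, recall
;
;   # Held-out data, pruning threshold 0 (values taken from the table above):
;   print(ambiguity_scores(0.288871, 0.415847, 0.930100))
;   # -> (0.2577..., 0.2369...), matching the reported 0.257790 and 0.236979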
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =     0.830*N^0.844  0.612*N^0.821  0.270*N^0.803  0.252*N^0.762  0.246*N^0.740  0.217*N^0.733
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by
; a power law. See the last line in the table above, which is based on 7
; differently sized samples of the available training data, ranging in size
; from 1.54 % to 98.56 % of the total.
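;
; As an illustration of the power law, the coefficients fitted for pruning
; threshold 0 (first entry of the #rules line above) can be used to estimate
; the rule count for a given number of word/lemma pairs N. Minimal Python
; sketch; the helper name is ours.
;
;   def estimated_rules(n_examples, a=0.830, b=0.844):
;       # Power-law estimate: #rules ~ a * N^b (threshold-0 coefficients)
;       return a * n_examples ** b
;
;   # For the 272512 word/lemma pairs of this run the estimate is about 32100,
;   # close to the 32620 rules reported for pruning threshold 0.
;   print(round(estimated_rules(272512)))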