; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-sk.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers:
; R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases,
; N/A=#not applicable cases, previous success state=>success state after rule application)
; (see the weighted-sum sketch at the end of this file)
-D 0.021767;-0.480966;0.778614;0.266345;0.021757;0.300908;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 0.010466
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for the next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flexrules using the full training set (-F: yes -F-: no)
-F
; Number of clusters found in word/lemma list: 71381
; Number of lines found in word/lemma list: 1910872
;
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there should be no errors (diff%).
; prun. thrshld.             0             1             2             3             4             5
; rules           70835.000000  40982.000000  15514.000000   9904.000000   7487.000000   6140.000000
; rules%              7.571555      4.380567      1.658292      1.058639      0.800286      0.656305
; same%              97.357572     95.010374     93.165024     92.335023     91.753328     91.326302
; ambi1%              1.287597      1.307479      1.054363      0.984243      0.927164      0.877567
; ambi2%              1.287597      1.182204      0.859182      0.785855      0.756140      0.726318
; ambi3%              0.067234      0.037732      0.014644      0.009299      0.006307      0.004062
; diff%               0.000000      2.462212      4.906787      5.885579      6.557062      7.065751
; same%stdev          0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi1%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi2%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; ambi3%stdev         0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
; diff%stdev          0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%          2.642428      2.621264      2.080614      1.941123      1.855183      1.770313
; false_amb%          0.000000      0.606387      0.694892      0.694678      0.690189      0.665177
; false_not_amb%      0.000000      0.627551      1.256706      1.395984      1.477434      1.537292
; true_amb%           2.642428      2.014877      1.385722      1.246445      1.164994      1.105136
; true_not_amb%      97.357572     96.751185     96.662680     96.662893     96.667383     96.692395
; precision           1.000000      0.624255      0.499268      0.472890      0.457691      0.453763
; recall              1.000000      0.762510      0.524412      0.471704      0.440880      0.418227
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.             0             1             2             3             4             5
; rules           70020.000000  40545.333333  15391.666667   9819.000000   7406.000000   6061.333333
; rules%              7.596173      4.398591      1.669777      1.065222      0.803446      0.657568
; same%              91.752053     91.667272     91.710874     91.347528     91.018095     90.715307
; ambi1%              1.099726      1.092459      0.971344      0.927743      0.903520      0.864763
; ambi2%              0.862340      0.891408      0.818739      0.777560      0.738803      0.748492
; ambi3%              0.012112      0.009689      0.004845      0.004845      0.009689      0.009689
; diff%               6.273769      6.339171      6.494199      6.942325      7.329894      7.661749
; same%stdev          0.850446      0.626816      0.565834      0.508730      0.547719      0.594312
; ambi1%stdev         0.076750      0.102915      0.042559      0.039049      0.065533      0.084288
; ambi2%stdev         0.124920      0.152555      0.170145      0.178862      0.108661      0.109451
; ambi3%stdev         0.010992      0.008330      0.008360      0.008360      0.008439      0.008439
; diff%stdev          0.671986      0.421705      0.364814      0.313048      0.426074      0.440122
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%          2.158273      2.211564      1.979023      1.877286      1.814306      1.770705
; false_amb%          0.336700      0.317322      0.256764      0.259187      0.251920      0.261609
; false_not_amb%      0.632221      0.583775      0.583775      0.588620      0.598309      0.607998
; true_amb%           0.363346      0.411792      0.411792      0.406947      0.397258      0.387569
; true_not_amb%      32.364412     32.383790     32.444348     32.441925     32.449192     32.439503
; precision           0.350467      0.393519      0.445026      0.439791      0.440860      0.425532
; recall              0.364964      0.413625      0.413625      0.408759      0.399027      0.389294
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =       1.301*N^0.793  0.737*N^0.795  0.258*N^0.800  0.142*N^0.810  0.128*N^0.795  0.128*N^0.780
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 different samples from the
; total available training data mass, varying in size from 1.54 % to 98.56 %.
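;
; Illustration: a minimal sketch (in Python) of how the power-law fit can be used to
; extrapolate the rule count for a different amount of training data. The constants 1.301
; and 0.793 are taken from the pruning-threshold-0 column of the power-law line above;
; the function and variable names are illustrative, not part of the trainer itself.
;
;   def estimated_rules(n_examples, a=1.301, b=0.793):
;       """Estimate the number of rules in the unpruned decision tree
;       from the number of word/lemma training pairs."""
;       return a * n_examples ** b
;
;   # e.g. the rough number of rules to expect from one million training pairs
;   print(round(estimated_rules(1_000_000)))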
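;
; Illustration: a minimal sketch (in Python) of one way the six -D penalty weights could
; be combined when candidate rules compete, assuming a plain weighted sum over the counts
; of R=>R, W=>R, R=>W, W=>W, R=>N/A and W=>N/A transitions, with the lower total penalty
; preferred. The actual combination used by the trainer may differ; the names below are
; illustrative only.
;
;   D = [0.021767, -0.480966, 0.778614, 0.266345, 0.021757, 0.300908]
;
;   def penalty(transition_counts, weights=D):
;       """Weighted sum over the six state-transition counts of a candidate rule."""
;       return sum(w * c for w, c in zip(weights, transition_counts))
;
;   # A candidate that turns 3 wrong cases into right ones and 1 right case into a wrong
;   # one scores lower (better) than one that does the opposite:
;   print(penalty([10, 3, 1, 5, 0, 0]) < penalty([10, 1, 3, 5, 0, 0]))  # True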