; verbose (-v: yes -v-: no)
-v -
; keep intermediary files (-x: yes -x-: no)
-x -
; flex rules (input file, binary format)
;-b (not specified, not doing actions with already created flex rules.)
; Word list (Optional if -b is specified. Otherwise N/A) (-I filename)
;-I (N/A)
; Output, lemmas of words in input (-I option)
;-O (N/A)
; word/lemma list
-i wfl-fa.txt.ph
; extra file name affix
-e ziggurat
; suffix only (-s: yes -s-: no)
-s -
; make rules with infixes less prevalent (-A: yes -A-: no)
-A -
; columns (1 or F or W=word, 2 or B or L=lemma, 3 or T=tags, 0 or O=other)
-n FBO
; max recursion depth when attempting to create a candidate rule
-Q 1
; flex rules (output, binary format, can be left unspecified)
;-o (Not specified, autogenerated)
; temp dir (including separator at end!)
-j tmp/
; penalties to decide which rule survives (4 or 6 floating point numbers:
; R=>R;W=>R;R=>W;W=>W[;R=>N/A;W=>N/A], where R=#right cases, W=#wrong cases,
; N/A=#not applicable cases, previous success state=>success state after rule application)
-D 0.0023118581;-0.6790274640;0.7275921544;0.0149554752;0.0028828714;0.0964055916;
; compute parms (-p: yes -p-: no)
-p
; expected optimal pruning threshold (only effective in combination with -XW)
-C -1
; tree penalty (-XC: constant -XD: more support is better -XE: higher entropy is better
; -XW: fewer pattern characters other than wildcards is better)
-X C
; current parameters (-P filename)
-P parms.txt
; best parameters (-B filename)
-B best_ziggurat.txt
; start training with minimal fraction of training pairs (-Ln: 0.0 < n <= 1.0)
-L 1.000000
; end training with maximal fraction of training pairs (-Hn: 0.0 < n <= 1.0)
-H 1.000000
; number of differently sized fractions of training data (natural number)
-K 20
; number of iterations of training with the same fraction of training data when the fraction is minimal (positive number)
-N 100.000000
; number of iterations of training with the same fraction of training data when the fraction is maximal (positive number)
-M 10.000000
; competition function (deprecated)
;-f (N/A)
; redo training after homographs for the next round are removed (-R: yes -R-: no)
;-R - (N/A)
; max. pruning threshold to evaluate
-c 5
; test with the training data (-T: yes -T-: no)
-T
; test with data not used for training (-t: yes -t-: no)
-t
; create flex rules using the full training set (-F: yes -F-: no)
-F
;
; Number of clusters found in word/lemma list: 6356
; Number of lines found in word/lemma list: 13006
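;
; (Illustrative sketch, not part of the training tool's output or its actual code.) One
; plausible reading of the -D option above is that the six weights assign a penalty to
; each transition of a training pair between success states (R = right, W = wrong,
; N/A = not applicable) when a candidate rule is applied, and that the candidate with the
; lowest total penalty survives. With hypothetical transition counts, in Python:
;
;   weights = [0.0023118581, -0.6790274640, 0.7275921544,   # R=>R, W=>R, R=>W
;              0.0149554752,  0.0028828714, 0.0964055916]   # W=>W, R=>N/A, W=>N/A
;   counts  = [120, 35, 4, 10, 2, 6]   # hypothetical counts for one candidate rule
;   penalty = sum(w * c for w, c in zip(weights, counts))
;   print(f"candidate penalty: {penalty:.4f}")
;
; Note that W=>R (a wrong lemma becoming right) carries the only negative weight, i.e. it
; lowers the penalty, while R=>W carries the largest positive weight.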
; Evaluation:
; -----------
; Lemmatization results for all data in the training set.
; For pruning threshold 0 there may be no errors (diff%).
; prun. thrshld.          0           1           2           3           4           5
; rules         1971.000000 1110.000000  316.000000  175.000000  118.000000   95.000000
; rules%          17.107890    9.634580    2.742817    1.518965    1.024217    0.824581
; same%           96.363163   89.974829   85.365854   83.768770   83.221943   82.527558
; ambi1%           1.770680    2.334867    1.762000    1.718601    1.267251    1.301970
; ambi2%           1.770680    2.074473    1.501606    1.423488    1.189133    1.345369
; ambi3%           0.095478    0.008680    0.000000    0.000000    0.000000    0.000000
; diff%            0.000000    5.607152   11.370541   13.089142   14.321673   14.825102
; same%stdev       0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; ambi1%stdev      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; ambi2%stdev      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; ambi3%stdev      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; diff%stdev       0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%       3.636837    4.582936    3.524000    3.489280    2.716778    2.977172
; false_amb%       0.000000    2.274108    2.360906    2.656019    2.126552    2.456384
; false_not_amb%   0.000000    1.328010    2.473744    2.803576    3.046611    3.116049
; true_amb%        3.636837    2.308827    1.163093    0.833261    0.590227    0.520788
; true_not_amb%   96.363163   94.089055   94.002257   93.707143   94.236611   93.906779
; precision        1.000000    0.336709    0.197640    0.135593    0.121864    0.095847
; recall           1.000000    0.634845    0.319809    0.229117    0.162291    0.143198
;
; Evaluation:
; -----------
; Lemmatization results for data that is not part of the training data.
; prun. thrshld.          0           1           2           3           4           5
; rules         1944.533333 1097.700000  311.866667  172.200000  117.266667   94.200000
; rules%          17.120737    9.664752    2.745845    1.516143    1.032480    0.829388
; same%           80.804574   81.131305   81.805187   82.172759   81.927711   82.193179
; ambi1%           1.756177    1.837860    1.409026    1.245661    1.123137    1.061875
; ambi2%           1.572391    1.490709    1.184399    1.204819    1.184399    0.898509
; ambi3%           0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; diff%           15.866857   15.540127   15.601389   15.376761   15.764754   15.846437
; same%stdev       4.067216    3.899318    3.425927    3.590445    3.699449    3.734649
; ambi1%stdev      1.194977    1.126517    0.884460    0.803959    0.965649    0.871877
; ambi2%stdev      0.706891    0.727104    0.827304    0.798081    0.784885    0.708120
; ambi3%stdev      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000
; diff%stdev       3.566288    3.734719    3.484871    3.693376    3.925267    3.989085
;
; Evaluation of prediction of ambiguity (whether a word has more than one possible lemma)
; ----------------------------------------------------------------------------------------
; amb.rules%       3.716561    3.634878    2.777211    2.675107    2.450480    2.144170
; false_amb%       0.122524    0.142945    0.081683    0.061262    0.040841    0.000000
; false_not_amb%   0.245048    0.245048    0.245048    0.245048    0.245048    0.326731
; true_amb%        0.081683    0.081683    0.081683    0.081683    0.081683    0.000000
; true_not_amb%    3.328569    3.308148    3.369410    3.389831    3.410251    3.451093
; precision        0.250000    0.222222    0.333333    0.400000    0.500000    0.000000
; recall           0.250000    0.250000    0.250000    0.250000    0.250000    0.000000
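;
; (Illustrative sketch, not part of the training tool's output.) The recall rows in the
; ambiguity-prediction tables above are consistent with the usual definition
; recall = true_amb / (true_amb + false_not_amb). Recomputing it from the printed
; training-set percentages, in Python:
;
;   true_amb      = [3.636837, 2.308827, 1.163093, 0.833261, 0.590227, 0.520788]
;   false_not_amb = [0.000000, 1.328010, 2.473744, 2.803576, 3.046611, 3.116049]
;   for t, (tp, fn) in enumerate(zip(true_amb, false_not_amb)):
;       print(t, round(tp / (tp + fn), 6))   # matches the recall row up to rounding
;
; The sums true_amb% + false_not_amb% are constant (3.636837), which suggests this is the
; overall fraction of ambiguous words in the training set.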
;
; Power law relating the number of rules in the decision tree to the number of examples in the training data
; -----------------------------------------------------------------------------------------------------------
; #rules =   0.640*N^0.856  0.473*N^0.825  0.166*N^0.807  0.108*N^0.794  0.085*N^0.786  0.069*N^0.787
;
; Postscriptum
; The number of rules can be estimated from the number of training examples by a power law.
; See the last line in the table above, which is based on 7 different samples from the total
; available training data mass, varying in size from 1.54 % to 98.56 %.
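;
; (Illustrative sketch, not part of the training tool's output.) The fitted power laws can
; be evaluated directly; the choice of N below is a hypothetical example, not the exact
; sample sizes used for the fit:
;
;   # (a, b) per pruning threshold, copied from the "#rules = a*N^b" line above
;   fits = {0: (0.640, 0.856), 1: (0.473, 0.825), 2: (0.166, 0.807),
;           3: (0.108, 0.794), 4: (0.085, 0.786), 5: (0.069, 0.787)}
;   N = 13006   # e.g. the number of lines in the word/lemma list
;   for t, (a, b) in sorted(fits.items()):
;       print(t, round(a * N ** b))   # estimated number of rules at each pruning threshold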