Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
dlist.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996,1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : February 1997 */
35/*-----------------------------------------------------------------------*/
36/* Decision lists */
37/* These are like a right branching decision tree, yes answers are not */
38/* partitioned any further. These are used for the Yarowsky-style */
39/* homograph disambiguators */
40/* */
41/* Yarowsky, D. "Homograph Disambiguation in Text-to_Speech Synthesis" */
42/* in "Progress in Speech Synthesis" eds van Santen, J. et al. Springer */
43/* pp 157-172, 1996 */
44/* Rivest, R.L. "Learning decision lists", Machine Learning 2:229-246 */
45/* 1987 */
46/* */
47/*=======================================================================*/
48#include <iostream>
49#include <cstring>
50#include "EST_Wagon.h"
51#include "EST_FMatrix.h"
52#include "EST_multistats.h"
53
54static WDlist *wagon_decision_list();
55static void do_dlist_summary(WDlist *dlist, WDataSet &dataset);
56static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds);
57
58WNode *wgn_build_dlist(float &score,ostream *output)
59{
61
62 dlist = wagon_decision_list();
63
64 *output << *dlist;
65
66 if (wgn_test_dataset.width() > 0) // load in test data
67 do_dlist_summary(dlist,wgn_test_dataset);
68 else
69 do_dlist_summary(dlist,wgn_dataset);
70
71 score = 0;
72 return 0;
73}
74
75static void do_dlist_summary(WDlist *dlist, WDataSet &dataset)
76{
77 // Test dlist against data to get summary of results
80 EST_Litem *p;
81 EST_String predict,real;
82 int i,type;
83
84 for (p=dataset.head(); p != 0; p=p->next())
85 {
86 predict = (EST_String)dlist->predict(*dataset(p));
87 type = dataset.ftype(0);
88 real = wgn_discretes[type].name(dataset(p)->get_int_val(0));
89 pairs.add_item(real,predict,1);
90 }
91 for (i=0; i<wgn_discretes[dataset.ftype(0)].length(); i++)
92 lex.append(wgn_discretes[dataset.ftype(0)].name(i));
93
94 const EST_FMatrix &m = confusion(pairs,lex);
95 print_confusion(m,pairs,lex);
96
97}
98
99static WDlist *wagon_decision_list()
100{
101 // For each possible question, measure it using
102 // yarowsky's loglikelihood ratio
103 // Abs(log(P(class_1|question_j)/P(class_2|question_j)))
104 // Output it for external sorting
105 // Hmm this implies binary classes
106 int i,cl;
108 WDlist *dlist, *d;
109
110 dlist=0;
111 for (i=1;i < wgn_dataset.width(); i++)
112 {
113 if (wgn_dataset.ftype(i) == wndt_ignore)
114 continue;
115 else if (wgn_dataset.ftype(i) >= wndt_class)
116 {
117 ques.set_fp(i);
118 ques.set_oper(wnop_is);
119 for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(i)].length(); cl++)
120 {
121 ques.set_operand1(EST_Val(cl));
122 d = dlist_score_question(ques,wgn_dataset);
123 if (d != 0)
124 dlist = add_to_dlist(dlist,d);
125 }
126 }
127 }
128
129 return dlist;
130}
131
132static WDlist *dlist_score_question(WQuestion &q, WVectorList &ds)
133{
134 // score this question for decision lists
135 // the sum of the impurities when ds is split with this question
136 WImpurity y;
137 EST_Litem *d;
138 WVector *wv;
139 int i;
140
141 for (i=0,d=ds.head(); d != 0; d=d->next(),i++)
142 {
143 wv = ds(d);
144 if (q.ask(*wv) == TRUE)
145 y.cumulate((*wv)[0]);
146 }
147
148 if (y.samples() > wgn_min_cluster_size)
149 {
150 q.set_yes((int)y.samples());
151
153
154 // Generalizing the formula in Yarowsky (pp157-172) we modify it
155 // to take absolute log-likelihood ration of the most probability
156 // of the most probable over the rest
157 EST_String t;
158 double p;
159 double f;
160 WDlist *n = new WDlist;
161 n->set_question(q);
162
163 t = pd.most_probable(&p);
164 f = pd.frequency(t);
165 n->set_score(fabs(log((f+0.0001)/(pd.samples()-f+0.0001))));
166 n->set_best(t,(int)f,(int)pd.samples());
167
168#if 0 // original two-case code
169 int freqa, freqb;
170 pd.item_freq(0,s,freqa);
171 pd.item_freq(1,s,freqb);
172
173 n->set_score(fabs(log((0.0001+freqa)/(0.0001+freqb))));
174 n->set_freqs(freqa,freqb);
175#endif
176 return n;
177 }
178
179 return 0;
180}
181
182EST_Val WDlist::predict(const WVector &d)
183{
184 if (p_question.ask(d))
185 return p_token;
186 else if (next == 0)
187 return "guess"; // should be a priori most probable as dlist can't help
188 else
189 return next->predict(d);
190}
191
192WDlist *add_to_dlist(WDlist *l, WDlist *a)
193{
194 // Add a to l at appropriate place in score order
195 WDlist *p,*lp;
196
197 if (l == 0)
198 return a;
199 else
200 {
201 for (lp=0,p=l; p != 0; lp=p,p=p->next)
202 {
203 if (a->score() > p->score())
204 {
205 a->next = p;
206 if (lp == 0)
207 return a;
208 else
209 {
210 lp->next = a;
211 return l;
212 }
213 }
214 }
215 // add to end
216 lp->next = a;
217 }
218 return l;
219}
220
222{
223 s << endl;
224 s << "(";
225 s << dlist.p_question;
226 s << " ((";
227 s << dlist.p_score;
228 s << " " << dlist.p_freq << " " << dlist.p_samples <<
229 " " << dlist.p_token << "))";
230 if (dlist.next != 0)
231 s << *dlist.next;
232 else
233 s << endl;
234 s << ")";
235
236 return s;
237}
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
double samples(void) const
Total number of example found.