Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
EST_PST.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1996 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Alan W Black */
34/* Date : July 1996 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* A general class for PredictionSuffixTrees */
38/* */
39/*=======================================================================*/
40
41#include <cstdlib>
42#include <iostream>
43#include <fstream>
44#include <cstring>
45#include "EST_String.h"
46#include "EST_types.h"
47#include "EST_Token.h"
48#include "EST_PST.h"
49#include "EST_multistats.h"
50
52
53// Out of vocabulary identifier
54const EST_String PredictionSuffixTree_oov("_OOV_");
55const EST_DiscreteProbDistribution PSTnullProbDistribution;
56
57void slide(EST_IVector &i, const int l);
58void slide(EST_StrVector &i, const int l);
59
60EST_PredictionSuffixTree_tree_node::~EST_PredictionSuffixTree_tree_node()
61{
62}
63
64void
65EST_PredictionSuffixTree_tree_node::print_freqs(ostream &os)
66{
67 // Print out path and frequency for each leaf
68
69 if (p_level == 0)
70 {
71 // Base -- print from pd
72 EST_String s;
73 double freq;
74 EST_Litem *i;
75 for (i = pd.item_start();
76 !pd.item_end(i);
77 i=pd.item_next(i))
78 {
79 pd.item_freq(i,s,freq);
80 os << get_path() << " " << s << " : " << freq << endl;
81 }
82 }
83 else
84 {
86 for (t.begin(nodes); t; t++)
87 pstnode(t->v)->print_freqs(os);
88 }
89}
90
91void
92EST_PredictionSuffixTree_tree_node::print_probs(ostream &os)
93{
94 // Print out path and probability distributions for node above leafs
95
96 if (p_level == 0)
97 {
98 // Base -- print from pd
99 EST_String s;
100 double prob;
101 os << get_path() << " :";
102 for (EST_Litem *i = pd.item_start(); !pd.item_end(i) ; i=pd.item_next(i))
103 {
104 pd.item_prob(i,s,prob);
105 os << " " << s << " " << prob;
106 }
107 os << endl;
108 }
109 else
110 {
112 for (t.begin(nodes); t; t++)
113 pstnode(t->v)->print_probs(os);
114 }
115}
116
117const EST_String &
118EST_PredictionSuffixTree_tree_node::most_probable(double *prob) const
119{
120 // Find most probable symbol in this node
121 return pd.most_probable(prob);
122}
123
124EST_PredictionSuffixTree::EST_PredictionSuffixTree(void)
125{
126 p_order = 0;
127 num_states = 0;
128 nodes = 0;
129 pd = 0;
130}
131
132void
133EST_PredictionSuffixTree::init(const int order)
134{
135 p_order = order;
136 num_states = 0;
138 nodes->set_level(order-1);
140}
141
142EST_PredictionSuffixTree::~EST_PredictionSuffixTree()
143{
144 delete nodes;
145 delete pd;
146}
147
148void
149EST_PredictionSuffixTree::clear(void)
150{
151 delete nodes;
152 delete pd;
153 pd = 0;
154}
155
157EST_PredictionSuffixTree::p_prob_dist(EST_PredictionSuffixTree_tree_node *node,
158 const EST_StrVector &words,
159 const int index) const
160{
161 // Return the probability distribution for this EST_PredictionSuffixTree context
162
163 if (words.n() == index+1){
164 return node->prob_dist();
165 }else
166 {
169 next = pstnode(node->nodes.f(words(index),est_val(n)));
170 if (next == 0)
171 {
172 //cerr << "EST_PredictionSuffixTree: "
173 //<< "no EST_PredictionSuffixTree probabilities for context \n";
174 return PSTnullProbDistribution;
175 }
176 return p_prob_dist(next,words,index+1);
177 }
178}
179
180
181void
182EST_PredictionSuffixTree::accumulate(const EST_StrVector &words,
183 const double count,
184 const int index)
185{
186
187/*
188 cerr << "accumulate ";
189 for (int i=0; i < p_order-1; i++)
190 cerr << words(i+index) << " ";
191 cerr << count << endl;
192 */
193
194 if (words.n()+index < p_order)
195 cerr << "EST_PredictionSuffixTree: accumulating window is too small"
196 << endl;
197 else
198 {
199 pd->cumulate(words(p_order-1+index),count); // a little extra book-keeping
200 p_accumulate(nodes,words,count,index);
201 }
202}
203
204void
205EST_PredictionSuffixTree::p_accumulate(EST_PredictionSuffixTree_tree_node *node,
206 const EST_StrVector &words,
207 double count,
208 const int index)
209{
210 /* Expand tree with new gram */
211 if (words.n() == index+1)
212 {
213 if (node->prob_dist().samples() == 0) // A new terminal node
214 node->set_state(num_states++);
215 node->cumulate(words(index),count);
216 }
217 else
218 {
221 next = pstnode(node->nodes.f(words(index),est_val(n)));
222 if (next == 0)
223 {
225 if (node->get_path() == "")
226 next->set_path(words(index));
227 else
228 {
229 next->set_path(node->get_path()+" "+ words(index));
230 }
231 next->set_level(node->get_level()-1);
232 node->nodes.set_val(words(index),est_val(next));
233 }
234 p_accumulate(next,words,count,index+1);
235 }
236}
237
238double
239EST_PredictionSuffixTree::rev_prob(const EST_StrVector &words) const
240{
241 // Reverse probability. What is prob of this context given predictee
242 const EST_DiscreteProbDistribution &pg = prob_dist(words);
243 double d1 = pg.frequency(words(order()-1));
244 double d2 = pd->frequency(words(order()-1));
245 double d3 = d1/d2;
246 return d3;
247}
248
249double
250EST_PredictionSuffixTree::rev_prob(const EST_StrVector &words,
251 const EST_DiscreteProbDistribution &pg) const
252{
253 // Reverse probability. What is prob of this context given predictee
254 return (double)pg.frequency(words(order()-1)) /
255 pd->frequency(words(order()-1));
256}
257
258const
259EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words) const
260{
261 double p;
262 int state;
263 return ppredict(nodes,words,&p,&state);
264}
265
266const
267EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words,double *p) const
268{
269 int state;
270 return ppredict(nodes,words,p,&state);
271}
272
273const
274EST_String &EST_PredictionSuffixTree::predict(const EST_StrVector &words,double *p, int *state) const
275{
276 return ppredict(nodes,words,p,state);
277}
278
279const
280EST_String &EST_PredictionSuffixTree::ppredict(EST_PredictionSuffixTree_tree_node *node,
281 const EST_StrVector &words,
282 double *p,int *state,
283 const int index) const
284{
285 /* Given the context whats the most probably symbol (or OOV) */
286 if (words.n() == index+1)
287 {
288 *state = node->get_state();
289 return node->most_probable(p);
290 }
291 else
292 {
295 next = pstnode(node->nodes.f(words(index),est_val(n)));
296 if (next == 0)
297 {
298 *p = 0.0;
299 *state = 0; // No not really acceptable
300 return PredictionSuffixTree_oov; /* utterly improbable -- i.e. not in EST_PredictionSuffixTree */
301 }
302 else
303 return ppredict(next,words,p,state,index+1);
304 }
305}
306
307void
308EST_PredictionSuffixTree::print_freqs(ostream &os)
309{
310 // Print a list of all PredictionSuffixTrees with their frequencies
311
312 os << "EST_PredictionSuffixTree order=" << p_order << endl;
313 nodes->print_freqs(os);
314
315}
316
317void
318EST_PredictionSuffixTree::print_probs(ostream &os)
319{
320 // Print a list of all grams(-last) with their probability distributions
321
322 os << "EST_PredictionSuffixTree " << p_order << endl;
323 nodes->print_probs(os);
324
325}
326
327int
328EST_PredictionSuffixTree::save(const EST_String filename, const EST_PredictionSuffixTree::EST_filetype type)
329{
330 // Save an EST_PredictionSuffixTree to file -- actual the grams and frequencies
331 (void)type;
332
333 if (filename == "-")
334 print_freqs(cout);
335 else
336 {
337 ofstream os(filename);
338 print_freqs(os);
339 }
340 return 0;
341}
342
343int
344EST_PredictionSuffixTree::load(const EST_String filename)
345{
346 // Load an EST_PredictionSuffixTree for given file
348 int i,order,freq;
349
350 clear();
351 if (ts.open(filename) != 0)
352 {
353 cerr << "EST_PredictionSuffixTree: failed to open \"" << filename << "\" for reading\n";
354 return -1;
355 }
356 ts.set_SingleCharSymbols(":");
357
358 if (ts.get() != "EST_PredictionSuffixTree") // magic number
359 {
360 cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" not an EST_PredictionSuffixTree\n";
361 return -1;
362 }
363
364 order = atoi(ts.get().string());
365 if ((order < 1) || (order > 10))
366 {
367 cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" has bad order\n";
368 return -1;
369 }
370 init(order);
371 //EST_String const* *window = new EST_String const*[order+1];
372 EST_StrVector window(order);
373 for (i=0; i<p_order; i++)
374 window[i] = "";
375
376 while (!ts.eof())
377 {
378 slide(window,-1);
379 window[p_order-1] = ts.get().string();
380 if (ts.get() != ":")
381 {
382 cerr << "EST_PredictionSuffixTree: file \"" << filename << "\" missed parsed line ";
383 cerr << ts.linenum() << " near EST_PredictionSuffixTree\n";
384 for (i=0; i < order; i++)
385 cout << " " << window(i);
386 cout << endl;
387 }
388
389 freq = atoi(ts.get().string());
390 accumulate(window,freq);
391 }
392
393 return 0;
394}
395
396void
397EST_PredictionSuffixTree::build(const EST_String filename,
398 const EST_String prev,
399 const EST_String prev_prev,
400 const EST_String last)
401{
403 int i;
404
405 if (filename == "-")
406 ts.open(stdin, FALSE);
407 else if (ts.open(filename) == -1)
408 return;
409
410 //EST_String const* *window = new EST_String const*[p_order+1];
411 EST_StrVector window(p_order);
412 for (i=0; i<p_order-1; i++)
413 window[i] = prev_prev;
414 window[p_order-1] = prev;
415
416 accumulate(window,1);
417
418 //window[p_order] = 0; // end of array marker
419
420 while (!ts.eof())
421 {
422 slide(window,-1);
423 window[p_order-1] = ts.get().string();
424 accumulate(window,1);
425 }
426
427 // and finish off
428
429 slide(window,-1);
430 window[p_order-1] = last;
431 accumulate(window,1);
432
433}
434
435void
436EST_PredictionSuffixTree::build(const EST_StrList &input)
437{
438
439 // knackered - fix
440
441
442 // adapted from build(..) above
443 // but could be made more efficient
444
445 int i;
446 //EST_String const* *window = new EST_String const *[p_order+1];
447 EST_StrVector window(p_order);
448 for (i=0; i<p_order; i++)
449 window[i] = "";
450
452 for(i_ptr=input.head();i_ptr!=NULL;i_ptr=i_ptr->next())
453 {
454 slide(window,-1);
455 window[p_order-1] = input(i_ptr);
456 accumulate(window,1);
457 }
458}
459
460
461void
462EST_PredictionSuffixTree::test(const EST_String filename)
463{
464 // Run the current EST_PredictionSuffixTree over the given file name and generate
465 // statistics of its prediction power for the contents of that
466 // file. Use the confusion function
470 int i;
471
472 if (filename == "-")
473 ts.open(stdin, FALSE);
474 else if (ts.open(filename) == -1)
475 return;
476
477 /* The top level nodes list will always contain all the tokens */
478 /* Build the lexicon for confusion matrix */
480 for (p.begin(nodes->nodes); p; p++)
481 lex.append(p->k);
482 lex.append("_OOV_");
483
484 EST_StrVector window(p_order);
485 //EST_String const* *window = new EST_String const*[p_order+1];
486 for (i=0; i<p_order; i++)
487 window[i] = "";
488 double e=0;
489 int num_tsamples = 0;
490
491 while (!ts.eof())
492 {
493 slide(window,-1);
494 window[p_order-1] = ts.get().string();
495 const EST_DiscreteProbDistribution &pdist = prob_dist(window);
496 e += pdist.entropy();
497 num_tsamples++;
498 // cumulate real and predicted
499 pairs.add_item(window(p_order-1),predict(window),1);
500 }
501
502 const EST_FMatrix &m = confusion(pairs,lex);
503 print_confusion(m,pairs,lex);
504 cout << "Mean entropy (?) is " << e/num_tsamples << endl;
505
506
507}
508
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
void item_prob(EST_Litem *idx, EST_String &s, double &prob) const
During iteration returns name and probability given index.
const EST_String & most_probable(double *prob=NULL) const
Return the most probable member of the distribution.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
K k
The key.
Definition EST_THash.h:78
V v
The value.
Definition EST_THash.h:80
void begin(const Container &over)
Set the iterator ready to run over this container.