Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
freqsmooth.cc
1/*************************************************************************/
2/* */
3/* Centre for Speech Technology Research */
4/* University of Edinburgh, UK */
5/* Copyright (c) 1997 */
6/* All Rights Reserved. */
7/* */
8/* Permission is hereby granted, free of charge, to use and distribute */
9/* this software and its documentation without restriction, including */
10/* without limitation the rights to use, copy, modify, merge, publish, */
11/* distribute, sublicense, and/or sell copies of this work, and to */
12/* permit persons to whom this work is furnished to do so, subject to */
13/* the following conditions: */
14/* 1. The code must retain the above copyright notice, this list of */
15/* conditions and the following disclaimer. */
16/* 2. Any modifications must be clearly marked as such. */
17/* 3. Original authors' names are not deleted. */
18/* 4. The authors' names are not used to endorse or promote products */
19/* derived from this software without specific prior written */
20/* permission. */
21/* */
22/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30/* THIS SOFTWARE. */
31/* */
32/*************************************************************************/
33/* Author : Paul Taylor & Simon King & Alan W Black */
34/* Date : April 1997 */
35/*-----------------------------------------------------------------------*/
36/* */
37/* Yet another method for smoothing an ngram */
38/* */
39/* Good_Turing smooth n-grammar so there are no zero occurrences */
40/* For each ngram */
41/* if |n-1gram| < threshold */
42/* F(ngram) = !n-1gram|*(P(n-1gram)) */
43/* */
44/*=======================================================================*/
45#include <iostream>
46#include <cstring>
47#include <cmath>
48#include <climits>
49#include <cfloat>
50#include "EST_Ngrammar.h"
51
52static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
53 int order,const EST_StrVector words,
54 int smooth_thresh);
55
56void Ngram_freqsmooth(EST_Ngrammar &ngram,int smooth_thresh1,
58{
59 // To assign reasonable frequencies for all ngrams in grammar
61 backoff_ngrams = new EST_Ngrammar[ngram.order()-1];
62
63 Good_Turing_smooth(ngram,smooth_thresh1,0);
64
65 fs_build_backoff_ngrams(backoff_ngrams,ngram);
66
67 fs_backoff_smooth(backoff_ngrams,ngram,smooth_thresh2);
68
69 delete [] backoff_ngrams;
70
71}
72
73void fs_build_backoff_ngrams(EST_Ngrammar *backoff_ngrams,
74 EST_Ngrammar &ngram)
75{
76 // Build all the backoff grammars back to uni-grams
77 int i,j,l;
78 EST_Litem *k;
79
80 for (i=0; i < ngram.order()-1; i++)
81 backoff_ngrams[i].init(i+1,EST_Ngrammar::dense,
82 *ngram.vocab,*ngram.pred_vocab);
83
84 for (i=0; i < ngram.num_states(); i++)
85 {
86 const EST_StrVector words = ngram.make_ngram_from_index(i);
87
88 for (k=ngram.p_states[i].pdf().item_start();
89 !ngram.p_states[i].pdf().item_end(k);
90 k = ngram.p_states[i].pdf().item_next(k))
91 {
92 double freq;
93 EST_String name;
94 ngram.p_states[i].pdf().item_freq(k,name,freq);
95 // Build all the sub-ngrams and accumulate them
96 for (j=0; j < ngram.order()-1; j++)
97 {
99 nnn[j] = name;
100 for (l=0; l < j; l++)
101 nnn[l] = words(ngram.order()-1-j);
102 backoff_ngrams[j].accumulate(nnn,freq);
103 }
104 }
105 }
106
107}
108
109int fs_backoff_smooth(EST_Ngrammar *backoff_ngrams,
110 EST_Ngrammar &ngram, int smooth_thresh)
111{
112 // For all ngrams which are too infrequent, adjust their
113 // frequencies based on their backoff probabilities
114 int i;
115 EST_Litem *j;
116 double occurs;
117 double backoff_prob;
118
119 if (ngram.representation() != EST_Ngrammar::dense)
120 {
121 cerr << "Ngrammar: can only ptsmooth dense ngrammars" << endl;
122 return FALSE;
123 }
124 else
125 {
126 for (i=0; i < ngram.num_states(); i++)
127 {
128 if (ngram.p_states[i].pdf().samples() < smooth_thresh)
129 {
130 EST_DiscreteProbDistribution &pdf = ngram.p_states[i].pdf();
131 occurs = ngram.p_states[i].pdf().samples();
132 EST_StrVector words = ngram.make_ngram_from_index(i);
133 words.resize(words.n()+1);
134
135 for (j=pdf.item_start();
136 ! pdf.item_end(j);
137 j = pdf.item_next(j))
138 {
139 EST_String name;
140 double freq;
141 pdf.item_freq(j,name,freq);
142 words[words.n()-1] = name;
143 backoff_prob =
144 fs_find_backoff_prob(backoff_ngrams,
145 ngram.order()-1,
146 words,
149 }
150 }
151 }
152 }
153
154 return TRUE;
155}
156
157static double fs_find_backoff_prob(EST_Ngrammar *backoff_ngrams,
158 int order,const EST_StrVector words,
159 int smooth_thresh)
160{
161 // Find the backoff prob for n-1gram
163 int i;
164
165 if (order == 0)
166 return TINY_FREQ; // ultimate floor
167
168 nnn.resize(order);
169
170 for(i=0; i<order; i++)
171 nnn[order-1-i] = words(words.n()-1-i);
172
173 if (backoff_ngrams[order-1].frequency(nnn) < smooth_thresh)
174 return fs_find_backoff_prob(backoff_ngrams,
175 order-1,words,smooth_thresh);
176 else
177 return backoff_ngrams[order-1].probability(nnn);
178}
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
double samples(void) const
Total number of example found.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.