49#include "EST_Ngrammar.h"
50#include "EST_Pathname.h"
52#include "EST_io_aux.h"
63 init(s.id(),s.pdf_const());
69 init(s->id(),s->pdf_const());
72EST_NgrammarState::~EST_NgrammarState()
77void EST_NgrammarState::clear()
83void EST_NgrammarState::init()
105 s <<
"(" << a.id() <<
": " << a.pdf_const() <<
" )";
111EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
118void EST_BackoffNgrammarState::clear()
124void EST_BackoffNgrammarState::init()
130void EST_BackoffNgrammarState::init(
const EST_Discrete *d,
int level)
164 if (
words.n()-1-p_level > 0)
174 return s->accumulate(
words,count);
207 if (
words.n()-1-p_level > 0)
222 cerr <<
"Failed to extend tree - unknown reason !" <<
endl;
225 return s->accumulate(
words,count);
235EST_BackoffNgrammarState::add_child(
const EST_Discrete *d,
240 if (
words.n()-1-p_level > 0)
245 return s->add_child(d,
words);
265EST_BackoffNgrammarState::add_child(
const EST_Discrete *d,
270 if (
words.n()-1-p_level > 0)
275 return s->add_child(d,
words);
298 children.
add(name,NULL);
302void EST_BackoffNgrammarState::print_freqs(
ostream &
os,
318 if (p_level==order-1)
322 <<
": " << freq <<
endl;
331 const double threshold)
const
334 s = get_state(
words);
339 return (
bool)((s->level()==0) ||
340 ( s->frequency(
words(0)) > threshold ));
350 if (
words.n()-1-p_level > 0)
357 return s->get_state(
words);
372void EST_BackoffNgrammarState::zap()
386 remove_child(
child,name);
395const double EST_BackoffNgrammarState::get_backoff_weight(
const EST_StrVector &
words)
const
398 if (
words.n()-1-p_level >= 0)
402 return s->get_backoff_weight(
words);
421 return backoff_weight;
426bool EST_BackoffNgrammarState::set_backoff_weight(
const EST_StrVector &
words,
const double w)
429 if (
words.n()-1-p_level >= 0)
433 return s->set_backoff_weight(
words,w);
441 cerr <<
"Couldn't set weight for " <<
words
442 <<
" to " << w <<
endl;
457void EST_BackoffNgrammarState::frequency_of_frequencies(
EST_DVector &
ff)
469 ff[(int)(freq+0.5)] += 1;
475 s <<
"(backoff level:" << a.p_level
476 <<
" weight:" << a.backoff_weight <<
" " << a.pdf_const() <<
" )";
483void EST_Ngrammar::default_values()
485 p_representation = EST_Ngrammar::sparse;
486 p_entry_type = EST_Ngrammar::frequencies;
487 sparse_representation.clear();
494 backoff_threshold = 1.0;
495 backoff_unigram_floor_freq = 0.0;
498EST_Ngrammar::~EST_Ngrammar()
503void EST_Ngrammar::clear()
508bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
511 return (
bool)(init_vocab(
wordlist) && p_init(
o,r));
514bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
521bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
526 vocab_pdf.
init(pred_vocab);
530bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
535 vocab_pdf.
init(pred_vocab);
539bool EST_Ngrammar::p_init(
int o, EST_Ngrammar::representation_t r)
543 cerr <<
"EST_Ngrammar order must be > 0" <<
endl;
548 p_representation = r;
549 p_number_of_sentences = 0;
551 switch(p_representation)
554 case EST_Ngrammar::sparse:
555 sparse_representation.init(p_order);
559 case EST_Ngrammar::dense:
560 return init_dense_representation();
563 case EST_Ngrammar::backoff:
564 return init_backoff_representation();
568 cerr <<
"Unknown internal representation requested for EST_Ngrammar"
575bool EST_Ngrammar::init_dense_representation()
582 cerr <<
"EST_Ngrammar: dense_representation requires explicit vocab"
589 for (i=0; i < p_num_states; i++)
590 p_states[i].init(i,pred_vocab);
595bool EST_Ngrammar::init_sparse_representation()
600 cerr <<
"EST_Ngrammar: dense_representation requires explicit vocab"
608 return (
bool)(p_states != NULL);
611bool EST_Ngrammar::init_backoff_representation()
616 backoff_representation->init(vocab,0);
621const EST_StrVector &EST_Ngrammar::make_ngram_from_index(
const int index)
const
630 for(i=p_order-2;i>=0;i--)
632#if defined(sun) && ! defined(__svr4__)
636 (*ngram)[i] = wordlist_index(
rem);
640 (*ngram)[i] = wordlist_index(d.rem);
658 vocab_pdf.
init(pred_vocab);
660 return (
bool)(vocab != NULL);
672 vocab_pdf.
init(pred_vocab);
674 return (
bool)(vocab != NULL);
697const EST_String & EST_Ngrammar::wordlist_index(
int i)
const
699 return vocab->
name(i);
713 cerr <<
"Word \"" <<
word <<
"\" is not in the predictee word list" <<
endl;
717 i = pred_vocab->
index(OOV_MARKER);
721 cerr <<
"Even " << OOV_MARKER <<
" is not in the predictee word list !" <<
endl;
728const EST_String & EST_Ngrammar::predlist_index(
int i)
const
730 return pred_vocab->
name(i);
745 cerr <<
"Word \"" <<
word <<
"\" is not in the word list" <<
endl;
749 i = vocab->
index(OOV_MARKER);
753 cerr <<
"Even " << OOV_MARKER <<
" is not in the word list !" <<
endl;
769 p_sentence_start_marker = prev;
770 p_sentence_end_marker = last;
774 if( (p_representation == EST_Ngrammar::backoff) &&
777 cerr <<
"Warning : building a backoff grammar" <<
endl
779 <<
"' is not recommended !" <<
endl;
790 if( (
oov_mode ==
"skip_sentence") &&
793 cerr <<
"Sorry, with input format 'ngram_per_line' you cannot " <<
endl
794 <<
" select oov_mode 'skip_sentence'" <<
endl;
806 for (p =
filenames.head(); p; p = p->next())
811 if( ((
oov_mode ==
"skip_sentence") &&
817 else if( ((
oov_mode ==
"skip_sentence") &&
830 <<
" (out of vocabulary words found)" <<
endl;
835 switch(p_representation)
837 case EST_Ngrammar::sparse:
850 case EST_Ngrammar::dense:
855 case EST_Ngrammar::backoff:
861 cerr <<
"Unknown internal representation set for EST_Ngrammar"
872 cerr <<
"Warning : couldn't remove temporary file : "
878 if (p_representation == EST_Ngrammar::backoff)
887 if (
words.n() < p_order)
888 cerr <<
"EST_Ngrammar::accumulate - window is too small" <<
endl;
895 switch(p_representation)
897 case EST_Ngrammar::dense:
898 case EST_Ngrammar::sparse:
899 find_state(
words).cumulate(w,count);
902 case EST_Ngrammar::backoff:
903 backoff_representation->accumulate(
words,count);
907 cerr <<
"EST_Ngrammar::accumulate : invalid representation !"
928 if (
words.n() < p_order)
929 cerr <<
"EST_Ngrammar::accumulate - window is too small" <<
endl;
935 switch(p_representation)
938 case EST_Ngrammar::dense:
939 case EST_Ngrammar::sparse:
940 find_state(
words).cumulate(
words(p_order-1),count);
943 case EST_Ngrammar::backoff:
944 backoff_representation->accumulate(
words,count);
948 cerr <<
"EST_Ngrammar::accumulate : invalid representation !"
958 switch(p_representation)
960 case EST_Ngrammar::sparse:
964 case EST_Ngrammar::dense:
968 case EST_Ngrammar::backoff:
971 return backoff_representation->ngram_exists(
words,0);
973 return backoff_representation->ngram_exists(
words,backoff_threshold);
978 cerr <<
"ngram_exists: unknown ngrammar representation" <<
endl;
984bool EST_Ngrammar::ngram_exists(
const EST_StrVector &
words,
const double threshold)
const
986 if (p_representation != EST_Ngrammar::backoff)
988 cerr <<
"Not a backoff grammar !" <<
endl;
992 return backoff_representation->ngram_exists(
words,threshold);
999 if(p_representation == EST_Ngrammar::backoff)
1000 return backoff_representation->get_backoff_weight(
words);
1003 cerr <<
"Can't get backoff weight - not a backed off ngrammar !" <<
endl;
1010 if(p_representation == EST_Ngrammar::backoff)
1011 return backoff_representation->set_backoff_weight(
words,w);
1014 cerr <<
"Can't set backoff weight - not a backed off ngrammar !" <<
endl;
1020bool EST_Ngrammar::build_sparse(
const EST_String &filename,
1025 sparse_representation.build(filename,prev,
prev_prev,last);
1036 for (i=0; i<
window.n()-1; i++)
1038 window[i++] = wordlist_index(prev);
1047 for (i=0; i<
window.n()-1; i++)
1052bool EST_Ngrammar::oov_preprocess(
const EST_String &filename,
1068 if( (
what ==
"eliminate lines") || (filename ==
"-") )
1072 if (filename ==
"-")
1074 if(
ts.open(
stdin, FALSE) == -1)
1076 cerr <<
"EST_Ngrammar:: failed to open stdin";
1081 else if (
ts.open(filename) == -1){
1082 cerr <<
"EST_Ngrammar: failed to open file \"" << filename
1083 <<
"\" for reading" <<
endl;
1095 cerr <<
"Ngrammar: couldn't create temporary file \""
1108 s=
ts.get().string();
1112 if(wordlist_index(s,
false) < 0)
1115 if(
what ==
"eliminate lines")
1126 cerr <<
"Warning : couldn't delete temporary file '"
1164bool EST_Ngrammar::build_ngram(
const EST_String &filename,
1170 p_entry_type = EST_Ngrammar::frequencies;
1189 p_number_of_sentences = 1;
1195 p_number_of_sentences = 1;
1205 if (filename ==
"-")
1207 if(
ts.open(
stdin, FALSE) == -1)
1209 cerr <<
"EST_Ngrammar:: failed to open stdin";
1214 else if (
ts.open(filename) == -1){
1215 cerr <<
"EST_Ngrammar: failed to open \"" << filename
1216 <<
"\" for reading" <<
endl;
1233 if (
window(p_order-1) == -1)
1235 else if( (p_order>1) && (
window(p_order-2) == -1))
1246 s=
ts.get().string();
1256 window[p_order-1] = wordlist_index(s);
1257 if (
window(p_order-1) < 0)
1259 cerr <<
"EST_Ngrammar::build_ngram " <<
1260 " word \"" << s <<
"\" is not in vocabulary, skipping"
1278 if(count == p_order-1)
1279 window[count++] = predlist_index(s);
1281 window[count++] = wordlist_index(s);
1285 cerr <<
"EST_Ngrammar::build_ngram " <<
1286 " word \"" << s <<
"\" is not in vocabulary, skipping"
1292 cerr <<
"Too many items on line - ignoring trailing ones !" <<
endl;
1302 if((count == p_order) &&
bad_word == 0)
1310 if (
window(p_order-1) != wordlist_index(last))
1311 p_number_of_sentences += 1;
1314 window[p_order-1] = wordlist_index(last);
1316 if(
window(p_order-1) == -1)
1329 if (
window(p_order-1) == -1)
1331 else if( (p_order>1) && (
window(p_order-2) == -1) )
1346 window[p_order-1] = wordlist_index(last);
1348 if (
window(p_order-1) == -1)
1354 p_number_of_sentences += 1;
1360 cerr <<
"Accumulated " << p_number_of_sentences <<
" sentences." <<
endl;
1377 cerr <<
"computing backoff w for ";
1394 cerr <<
"WARNING : couldn't set weight !" <<
endl;
1405 for(
j=0;
j<n->get_pred_vocab_length();
j++)
1407 ngram[ngram.
n()-1] = n->get_pred_vocab_word(
j);
1409 for(i=0;i<ngram.
n();i++)
1410 cerr << ngram(i) <<
" ";
1412 if (n->ngram_exists(ngram))
1414 cerr << n->probability(ngram) <<
" exists " <<
endl;
1416 sum1 += n->probability(ngram);
1429 cerr <<
" unseen, P(";
1443 cerr <<
"WARNING : couldn't set weight !" <<
endl;
1450 cerr <<
"NEGATIVE WEIGHT for ";
1458 for(
j=0;
j<n->get_pred_vocab_length();
j++)
1460 ngram[ngram.
n()-1] = n->get_pred_vocab_word(
j);
1463 if (n->ngram_exists(ngram))
1466 for(i=0;i<ngram.
n();i++)
1467 cerr << ngram(i) <<
" ";
1468 cerr <<
" exists, prob = ";
1469 cerr << n->probability(ngram,
false,
true) <<
endl;
1478 cerr <<
"WARNING : couldn't set weight !" <<
endl;
1485 ngram[ngram.
n()-1] =
tmp;
1489bool EST_Ngrammar::compute_backoff_weights(
const int mincount,
1504 backoff_restore_unigram_states();
1506 Good_Turing_discount(*
this,
maxcount,0.5);
1534 for (
o=2;
o<=order();
o++)
1537 cerr <<
"Backing off order " <<
o <<
endl;
1548 iterate(
words,&compute_backoff_weight,NULL);
1558void EST_Ngrammar::backoff_restore_unigram_states()
1568 words[0] =
"wibble";
1569 for(
j=0;
j<get_pred_vocab_length();
j++)
1571 words[1] = get_pred_vocab_word(
j);
1572 backoff_representation->accumulate(
words,0);
1581 if (start_state == NULL)
1582 start_state = backoff_representation;
1592 for (k=start_state->pdf_const().item_start();
1593 !start_state->pdf_const().item_end(k);
1594 k = start_state->pdf_const().item_next(k))
1596 start_state->pdf_const().item_freq(k,name,freq);
1597 if (freq < TINY_FREQ)
1605 start_state->remove_child(
child,name);
1612 for (k=start_state->pdf_const().item_start();
1613 !start_state->pdf_const().item_end(k);
1614 k = start_state->pdf_const().item_next(k))
1616 start_state->pdf_const().item_freq(k,name,freq);
1623 prune_backoff_representation(
child);
1631 switch(n.p_representation)
1633 case EST_Ngrammar::sparse:
1634 n.sparse_representation.print_freqs(s);
1637 case EST_Ngrammar::dense:
1638 s <<
"Dense" <<
endl;
1642 case EST_Ngrammar::backoff:
1643 s <<
"Backoff" <<
endl;
1644 s << *(n.backoff_representation) <<
endl;
1648 cerr <<
"Unknown internal representation of EST_Ngrammar : can't print"
1657EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t
new_type)
1664 cerr <<
"Couldn't do entry type conversion !" <<
endl;
1668bool EST_Ngrammar::sparse_to_dense()
1670 cerr <<
"EST_Ngrammar::sparse_to_dense() "
1671 <<
" not implemented" <<
endl;
1675bool EST_Ngrammar::dense_to_sparse()
1677 cerr <<
"EST_Ngrammar::dense_to_sparse()"
1678 <<
" not implemented" <<
endl;
1687 for(i=0;i<p_order-1;i++)
1690 wa =
words.a_no_check(i+index);
1699int EST_Ngrammar::find_next_state_id(
int state,
int word)
const
1706 for (f=1,i=0; i<p_order-2; i++)
1713 switch(p_representation)
1715 case EST_Ngrammar::sparse:
1720 case EST_Ngrammar::dense:
1724 for(i=0;i<p_order-1;i++)
1727 if (
tmp(i) == -1)
break;
1730 if (
tmp(i) == -1)
break;
1731 return p_states[find_dense_state_index(
tmp)];
1735 case EST_Ngrammar::backoff:
1736 cerr <<
"find_state: not valid in backoff mode !" <<
endl;
1740 cerr <<
"find_state: unknown ngrammar representation" <<
endl;
1751 switch(p_representation)
1753 case EST_Ngrammar::sparse:
1758 case EST_Ngrammar::dense:
1762 for(i=0;i<p_order-1;i++)
1765 if (
tmp(i) == -1)
break;
1768 if (
tmp(i) == -1)
break;
1769 return p_states[find_dense_state_index(
tmp)];
1773 case EST_Ngrammar::backoff:
1774 cerr <<
"find_state_const: not valid in backoff mode !" <<
endl;
1778 cerr <<
"find_state: unknown ngrammar representation" <<
endl;
1787 switch(p_representation)
1789 case EST_Ngrammar::sparse:
1794 case EST_Ngrammar::dense:
1795 return p_states[find_dense_state_index(
words)];
1798 case EST_Ngrammar::backoff:
1799 cerr <<
"find_state: not valid in backoff mode !" <<
endl;
1803 cerr <<
"find_state: unknown ngrammar representation" <<
endl;
1813 switch(p_representation)
1815 case EST_Ngrammar::sparse:
1819 case EST_Ngrammar::dense:
1820 return p_states[find_dense_state_index(
words)];
1823 case EST_Ngrammar::backoff:
1824 cerr <<
"find_state_const: not valid in backoff mode !" <<
endl;
1828 cerr <<
"find_state: unknown ngrammar representation" <<
endl;
1836bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t
new_representation)
1843 return sparse_to_dense();
1845 return dense_to_sparse();
1848 cerr <<
"set_representation: unknown ngrammar representation" <<
endl;
1858 switch(p_representation)
1860 case EST_Ngrammar::sparse:
1861 case EST_Ngrammar::dense:
1862 return find_state_const(
words).probability(lastword(
words));
1865 case EST_Ngrammar::backoff:
1866 return backoff_probability(
words,trace);
1870 cerr <<
"probability: unknown ngrammar representation" <<
endl;
1881 switch(p_representation)
1883 case EST_Ngrammar::sparse:
1884 case EST_Ngrammar::dense:
1885 return find_state_const(
words).frequency(lastword(
words));
1888 case EST_Ngrammar::backoff:
1889 return backoff_probability(
words,trace);
1893 cerr <<
"probability: unknown ngrammar representation" <<
endl;
1900 double *prob,
int *state)
const
1904 switch(p_representation)
1906 case EST_Ngrammar::sparse:
1907 case EST_Ngrammar::dense:
1911 return s.most_probable(prob);
1915 case EST_Ngrammar::backoff:
1917 return backoff_most_probable(
words,prob);
1921 cerr <<
"probability: unknown ngrammar representation" <<
endl;
1928 double *prob,
int *state)
const
1932 switch(p_representation)
1934 case EST_Ngrammar::sparse:
1935 case EST_Ngrammar::dense:
1939 return s.most_probable(prob);
1943 case EST_Ngrammar::backoff:
1944 cerr <<
"probability: IVector access to backoff not supported" <<
endl;
1949 cerr <<
"probability: unknown ngrammar representation" <<
endl;
1957 switch(p_representation)
1959 case EST_Ngrammar::sparse:
1960 case EST_Ngrammar::dense:
1966 cerr <<
"Ngrammar: representation doesn't support states" <<
endl;
1974 switch(p_representation)
1976 case EST_Ngrammar::sparse:
1977 case EST_Ngrammar::dense:
1983 cerr <<
"Ngrammar: representation doesn't support states" <<
endl;
1989EST_String EST_Ngrammar::get_vocab_word(
int i)
const
1992 return vocab->
name(i);
1997int EST_Ngrammar::get_vocab_word(
const EST_String &s)
const
1999 int index = vocab->
name(s);
2009 switch(p_representation)
2011 case EST_Ngrammar::sparse:
2012 case EST_Ngrammar::dense:
2016 return s.frequency(lastword(
words))/
2017 vocab_pdf.frequency(lastword(
words));
2021 case EST_Ngrammar::backoff:
2022 return backoff_reverse_probability(
words);
2026 cerr <<
"probability: unknown ngrammar representation" <<
endl;
2038 switch(p_representation)
2040 case EST_Ngrammar::sparse:
2041 case EST_Ngrammar::dense:
2045 return s.frequency(lastword(
words))/
2046 vocab_pdf.frequency(lastword(
words));
2050 case EST_Ngrammar::backoff:
2051 cerr <<
"probability: reverse prob unavailable for backoff ngram"
2057 cerr <<
"probability: unknown ngrammar representation" <<
endl;
2064EST_Ngrammar::prob_dist(
int state)
const
2066 return p_states[state].pdf_const();
2073 switch(p_representation)
2075 case EST_Ngrammar::sparse:
2076 case EST_Ngrammar::dense:
2079 return s.pdf_const();
2083 case EST_Ngrammar::backoff:
2084 return backoff_prob_dist(
words);
2088 cerr <<
"probability: unknown ngrammar representation" <<
endl;
2089 return PSTnullProbDistribution;
2098 switch(p_representation)
2100 case EST_Ngrammar::sparse:
2101 case EST_Ngrammar::dense:
2104 return s.pdf_const();
2108 case EST_Ngrammar::backoff:
2109 cerr <<
"probability: unsupport IVector access of backoff ngram" <<
endl;
2110 return PSTnullProbDistribution;
2114 cerr <<
"probability: unknown ngrammar representation" <<
endl;
2115 return PSTnullProbDistribution;
2121EST_Ngrammar::load(
const EST_String &filename)
2124 EST_read_status
r_val;
2130 if ((
r_val = load_ngram_cstr_ascii(filename, *
this)) != wrong_format)
2132 if ((
r_val = load_ngram_cstr_bin(filename, *
this)) != wrong_format)
2140 if (
fname.extension() == GZIP_FILENAME_EXTENSION)
2141 tmp_fname = uncompress_file_to_temporary(filename,
2142 "gzip --decompress --stdout");
2143 else if (
fname.extension() == COMPRESS_FILENAME_EXTENSION)
2144 tmp_fname = uncompress_file_to_temporary(filename,
"uncompress -c");
2153 return misc_read_error;
2155 cerr <<
"EST_Ngrammar::load can't determine ngrammar file type for input file " << filename <<
endl;
2156 return wrong_format;
2166 EST_read_status
r_val;
2168 if ((
r_val = load_ngram_arpa(filename, *
this,
wordlist)) != wrong_format)
2176 if ((
r_val = load_ngram_cstr_ascii(filename, *
this)) != wrong_format)
2182 cerr <<
"Wordlist file does not match grammar wordlist !" <<
endl;
2183 return misc_read_error;
2187 if ((
r_val = load_ngram_cstr_bin(filename, *
this)) != wrong_format)
2193 cerr <<
"Wordlist does not match grammar !" <<
endl;
2194 return misc_read_error;
2199 cerr <<
"EST_Ngrammar::load can't determine ngrammar file type for input file " << filename <<
endl;
2200 return wrong_format;
2205EST_Ngrammar::make_htk_compatible()
2208 cerr <<
"EST_Ngrammar::make_htk_compatible() not written yet." <<
endl;
2214 const bool trace,
double floor)
2218 return save(filename,
"cstr_ascii",
false,
floor);
2219 if (type ==
"htk_ascii")
2220 return save_ngram_htk_ascii(filename, *
this,
floor);
2224 return save_ngram_arpa(filename, *
this);
2225 if (type ==
"cstr_ascii")
2226 return save_ngram_cstr_ascii(filename, *
this,trace,
floor);
2227 if (type ==
"cstr_bin")
2228 return save_ngram_cstr_bin(filename, *
this, trace,
floor);
2230 return save_ngram_wfst(filename, *
this);
2232 cerr <<
"EST_Ngrammar::save unknown output file type " << type <<
endl;
2246 for(i=0;i<
words.n();i++)
2265 for(i=0;i<pred_vocab->
length();i++){
2273 for(i=0;i<vocab->
length();i++){
2292 for(i=0;i<
words.n();i++)
2311 for(i=0;i<pred_vocab->
length();i++){
2319 for(i=0;i<vocab->
length();i++){
2331 if (p_representation == EST_Ngrammar::backoff)
2332 backoff_representation->print_freqs(
os,p_order);
2339 for (i=0; i < p_num_states; i++)
2342 for (k=p_states[i].pdf().item_start();
2343 !p_states[i].pdf().item_end(k);
2344 k = p_states[i].pdf().item_next(k))
2349 p_states[i].pdf().
item_freq(k,name,freq);
2354 for (
j = p_order-2;
j >= 0;
j--)
2359 for (
j = 0;
j < p_order-1;
j++)
2361 os << name <<
" : " << freq <<
endl;
2378 for(i=0;i<
words.n();i++)
2379 ngram[i] =
words(i);
2383 for(
j=0;
j<get_pred_vocab_length();
j++)
2385 ngram[ngram.
n()-1] = get_pred_vocab_word(
j);
2386 double tmp = backoff_probability(ngram,
false);
2394const double EST_Ngrammar::get_backoff_discount(
const int order,
const double freq)
const
2398 cerr <<
"order too great in EST_Ngrammar::get_backoff_discount" <<
endl;
2402 else if( (
int)freq < backoff_discount[order-1].n())
2403 return backoff_discount[order-1]((int)freq);
2410 const bool trace)
const
2420 cerr <<
"backoff_probability( ";
2421 for(i=0;i<
words.n();i++)
2430 cerr <<
"unigram " << backoff_representation->probability(
words(0))
2433 f=backoff_representation->frequency(
words(0));
2438 return f / backoff_representation->pdf_const().samples();
2440 return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2450 state=backoff_representation->get_state(
words);
2452 if( (state != NULL) &&
2453 ((f=state->frequency(
words(0))) > backoff_threshold) )
2464 if((
new_ngram(0) == p_sentence_start_marker) ||
2465 (
new_ngram(0) == p_sentence_end_marker) )
2467 f2 = p_number_of_sentences;
2469 cerr <<
"special freq used : " << f2 <<
endl;
2473 state=backoff_representation->get_state(
new_ngram);
2476 cerr <<
"Something went horribly wrong !" <<
endl;
2494 cerr <<
" ..... got (" << f <<
" - "
2495 << get_backoff_discount(state->level()+1,f)
2496 <<
")/" << f2 <<
" = "
2497 << (f - get_backoff_discount(state->level()+1,f) ) / f2
2500 return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2513 cerr <<
"backed off(" <<
bo_wt <<
") to (";
2554 return root->probability(
words(0));
2564 state=root->get_state(
words);
2567 if( (state != NULL) &&
2568 ((f=state->frequency(
words(0))) > 0) )
2574 cerr <<
"Something went horribly wrong !" <<
endl;
2580 return f / state->frequency(
new_ngram(0));
2613 state = backoff_representation->get_child(
words(
words.n()-1));
2623 return backoff_reverse_probability_sub(
words,state);
2631 return backoff_prob_dist(
words).most_probable(prob);
2647 for(i=0;i<v.
n()+l;i++)
2658 for(i=v.
n()-1;i>=l;i--)
2680 for (k=start_state->pdf_const().item_start();
2681 !start_state->pdf_const().item_end(k);
2682 k = start_state->pdf_const().item_next(k))
2684 start_state->pdf_const().item_freq(k,name,freq);
2700 if (start_state->level() == level)
2704 else if (start_state->level() < level)
2712 for (k=start_state->pdf_const().item_start();
2713 !start_state->pdf_const().item_end(k);
2714 k = start_state->pdf_const().item_next(k))
2716 start_state->pdf_const().item_freq(k,name,freq);
2731 float *weight = (
float*)((
void**)
params)[1];
2733 if(
other_n->ngram_exists(ngram))
2734 n->accumulate(ngram,*weight *
other_n->frequency(ngram));
2742 words.resize(p_order);
2744 void **
params =
new void*[2];
2746 params[1] = (
void*)&weight;
2748 iterate(
words,&merge_other_grammar,(
void*)
params);
2767 for(i=0;i<v.
n()+l;i++)
2778 for(i=v.
n()-1;i>=l;i--)
const EST_Discrete *const get_discrete() const
Returns discrete vocabulary of distribution.
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
bool init(const EST_StrList &vocab)
Initialise using given vocabulary.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
void set_num_samples(const double c)
EST_Litem * item_start() const
Used for iterating through members of the distribution.
void clear(void)
Reset, clearing all counts and vocabulary.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
bool init(const EST_StrList &vocab)
(re-)initialise
const int index(const EST_String &n) const
const EST_String & name(const int n) const
The name given the index.
const int length(void) const
The number of members in the discrete.
void add(const EST_String &key, void *item)
Add {\tt item} indexed by {\tt key}, overwriting previous contents.
void * lookup(const EST_String &key) const
Find contents index by {\tt key}, 0 if there is not contents.
void clear(void)
Delete the tree.
static const EST_String Empty
Constant empty string.
void resize(int n, int set=1)
INLINE int n() const
number of items in vector.