48#include "EST_String.h"
49#include "EST_Ngrammar.h"
51#include "EST_cutils.h"
79 if (
ts.open(filename) == -1)
80 return misc_read_error;
83 while ((!
ts.eof()) && !
ts.get().string().contains(
"\\data\\"));
99 if (
ts.peek().string().contains(
"-grams:"))
102 s=
ts.get_upto_eoln().string();
131 if(!n.init(order,EST_Ngrammar::backoff,vocab))
132 return misc_read_error;
135 for(i=1;i<=order;i++)
152 cerr <<
"Unexpected end of grammar file whilst looking for '"
154 return misc_read_error;
164 for (k=0; ((k<i) && !
ts.eof()); k++)
169 cerr <<
"Unexpected end of file whilst reading " << i
170 <<
"-grams !" <<
endl;
171 return misc_read_error;
185 weight =
atof(
ts.get().string());
186 n.set_backoff_weight(
window,weight);
191 cerr <<
"EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
194 return misc_read_error;
203 if (
ts.get().string() ==
"\\end\\")
210 cerr <<
"Missing \\end\\ !" <<
endl;
213 return misc_read_error;
224 if (
ts.open(filename) == -1)
225 return misc_read_error;
227 if (
ts.peek().string() !=
"Ngram_2")
234 order =
atoi(
ts.get().string());
240 vocab.append(
ts.get().string());
243 pred_vocab.
append(
ts.get().string());
245 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
247 cerr <<
"Something may be wrong with the vocab lists in '"
248 << filename <<
"'" <<
endl;
249 return misc_read_error;
256 for (i=0; i < order; i++)
258 if (
ts.get().string() !=
":")
260 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
262 return misc_read_error;
268 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
270 return misc_read_error;
292 if ((
ifd=
fopen(filename,
"rb")) == NULL)
293 return misc_read_error;
294 fread(&magic,
sizeof(
int),1,
ifd);
296 if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
298 else if (magic != EST_NGRAMBIN_MAGIC)
300 if (
ts.open(
ifd, FALSE) == -1)
301 return misc_read_error;
303 ts.set_SingleCharSymbols(
"\n");
304 ts.set_WhiteSpaceChars(
" \t\r");
306 if (
ts.peek().string() !=
"mBin_2")
314 order =
atoi(
ts.get().string());
315 if (
ts.get() !=
"\n")
319 return misc_read_error;
324 while ((
ts.peek() !=
"\n") && (!
ts.eof()))
325 vocab.append(
ts.get().string());
327 while ((
ts.peek() !=
"\n") && (!
ts.eof()))
328 pred_vocab.
append(
ts.get().string());
332 fseek(
ifd,(
long)(
ts.peek().filepos()+5),SEEK_SET);
334 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
338 return misc_read_error;
347 double *
dd =
new double[num_entries];
352 if (fread(
dd,
sizeof(
double),num_entries,
ifd) != (
unsigned)num_entries)
354 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" <<
endl;
357 return misc_read_error;
360 swap_bytes_double(
dd,num_entries);
362 for(
j=i=0;i<n.num_states();i++)
364 if (
j >= num_entries)
366 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" <<
endl;
369 return misc_read_error;
372 (!n.p_states[i].pdf().
item_end(k)) && (
j < num_entries) ;
382 if (
j+1 >= num_entries)
384 else if (
dd[
j+1] < -1)
386 else if (
dd[
j+1] == -1)
424 if (
word == n.p_sentence_end_marker)
427 *
ost <<
" 0*" << n.pred_vocab->
length()-1 <<
" " << 1 <<
endl;
433 cerr <<
"ERROR : floor is impossibly large, scaling it !" <<
endl;
444 if(name != n.p_sentence_start_marker)
469 if ( (name == n.p_sentence_start_marker) ||
470 (name == n.p_sentence_end_marker) ||
471 (name == OOV_MARKER) )
505 if(!n.closed_vocab())
510 *
ost << 0 <<
" ERROR !!!!!!!! ";
516 freq =
this_pdf.frequency(n.p_sentence_end_marker);
548save_ngram_htk_ascii(
const EST_String filename,
557 cerr <<
"Can only save bigrams in htk_ascii format" <<
endl;
558 return misc_write_error;
563 cerr <<
"Negative floor probability does not make sense !" <<
endl;
564 return misc_write_error;
578 cerr <<
"ERROR : floor is impossibly large, scaling it to ";
584 if(n.p_sentence_start_marker ==
"")
586 cerr <<
"Can't save in HTK format as no sentence start/end tags"
587 <<
" were given !" <<
endl;
588 return misc_write_error;
592 save_ngram_htk_ascii_sub(n.p_sentence_start_marker,
ost,n,
floor);
595 for(i=0;i<n.vocab->
length();i++)
597 if ( (n.vocab->
name(i) != n.p_sentence_start_marker) &&
598 (n.vocab->
name(i) != n.p_sentence_end_marker) &&
599 (n.vocab->
name(i) != OOV_MARKER) )
603 if(!n.closed_vocab())
604 save_ngram_htk_ascii_sub(OOV_MARKER,
ost,n,
floor);
606 save_ngram_htk_ascii_sub(n.p_sentence_end_marker,
ost,n,
floor);
625 if(n->ngram_exists(ngram))
626 *((
double*)count) += 1;
635 if(n->ngram_exists(ngram))
637 *((
ostream*)(
ost)) << safe_log10(n->probability(ngram)) <<
" ";
638 for(i=0;i<ngram.
n();i++)
641 if ((n->representation() == EST_Ngrammar::backoff) &&
642 (n->order() > ngram.
n()) )
643 *((
ostream*)(
ost)) << safe_log10(n->get_backoff_weight(ngram));
674 double *count =
new double;
676 if (n.representation() == EST_Ngrammar::backoff)
678 for(
o=1;
o<=n.order();
o++)
687 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
688 *
ost <<
"ngram " <<
o <<
"=" << *count <<
endl;
691 for(
o=1;
o<=n.order();
o++)
694 *
ost <<
"\\" <<
o <<
"-grams:" <<
endl;
698 n.iterate(ngram,&save_ngram_arpa_sub,(
void*)
ost);
705 for(i=0;i<n.order();i++)
708 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
709 *
ost <<
"ngram " << n.order() <<
"=" << *count <<
endl;
712 *
ost <<
"\\" << n.order() <<
"-grams:" <<
endl;
714 for(i=0;i<n.order();i++)
716 n.iterate(ngram,&save_ngram_arpa_sub,
ost);
730 const bool trace,
double floor)
746 *
ost <<
"Ngram_2 " << n.order() <<
endl;
747 for (i=0; i < n.vocab->
length(); i++)
748 *
ost << n.vocab->
name(i) <<
" ";
750 for (i=0; i < n.pred_vocab->
length(); i++)
751 *
ost << n.pred_vocab->
name(i) <<
" ";
754 if (n.representation() == EST_Ngrammar::dense)
756 else if (n.representation() == EST_Ngrammar::backoff)
776 *
ost << name <<
" : " << freq <<
endl;
794 if ((
ost =
fopen(filename,
"wb")) == NULL)
796 cerr <<
"Ngrammar save: unable to open \"" << filename <<
797 "\" for writing" <<
endl;
804 for (i=0; i < n.vocab->
length(); i++)
808 for (i=0; i < n.vocab->
length(); i++)
814 for (i=0; i<n.num_states(); i++)
827 const bool trace,
double floor)
830 if (n.representation() == EST_Ngrammar::sparse)
831 return misc_write_error;
838 int magic = EST_NGRAMBIN_MAGIC;
843 return misc_write_error;
847 if ((
ofd=
fopen(filename,
"wb")) == NULL)
848 return misc_write_error;
853 for (i=0; i < n.vocab->
length(); i++)
856 for (i=0; i < n.pred_vocab->
length(); i++)
867 if (n.representation() == EST_Ngrammar::dense)
869 for(i=0;i<n.num_states();i++)
873 cerr <<
"\r" << i*100/n.num_states() <<
"%";
881 n.p_states[i].pdf().
item_freq(k,name,freq);
899 else if (n.representation() == EST_Ngrammar::backoff)
EST_Litem * item_next(EST_Litem *idx) const
Used for iterating through members of the distribution.
void item_freq(EST_Litem *idx, EST_String &s, double &freq) const
During iteration returns name and frequency given index
EST_Litem * item_start() const
Used for iterating through members of the distribution.
void cumulate(const EST_String &s, double count=1)
Add this observation, may specify number of occurrences.
void set_frequency(const EST_String &s, double c)
int item_end(EST_Litem *idx) const
Used for iterating through members of the distribution.
const EST_String & name(const int n) const
The name given the index.
const int length(void) const
The number of members in the discrete.
EST_String before(int pos, int len=0) const
Part before position.
int contains(const char *s, int pos=-1) const
Does it contain this substring?
EST_String after(int pos, int len=1) const
Part after pos+len.
void append(const T &item)
add item onto end of list
INLINE int n() const
number of items in vector.