[Julius-cvs 428] CVS update: julius4/libsent/src/ngram

アーカイブの一覧に戻る

sumom****@users***** sumom****@users*****
2009年 7月 4日 (土) 23:11:43 JST


Index: julius4/libsent/src/ngram/ngram_malloc.c
diff -u julius4/libsent/src/ngram/ngram_malloc.c:1.3 julius4/libsent/src/ngram/ngram_malloc.c:1.4
--- julius4/libsent/src/ngram/ngram_malloc.c:1.3	Tue Feb 10 02:27:46 2009
+++ julius4/libsent/src/ngram/ngram_malloc.c	Sat Jul  4 23:11:43 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:48:56 2005
  *
- * $Revision: 1.3 $
+ * $Revision: 1.4 $
  * 
  */
 /*
@@ -36,6 +36,8 @@
   NGRAM_INFO *new;
 
   new = (NGRAM_INFO *)mymalloc(sizeof(NGRAM_INFO));
+  new->n = 0;
+  new->d = NULL;
   new->bo_wt_1 = NULL;
   new->p_2 = NULL;
   new->bos_eos_swap = FALSE;
@@ -84,8 +86,11 @@
   if (ndata->bo_wt_1) free(ndata->bo_wt_1);
   if (ndata->p_2) free(ndata->p_2);
   /* free n-gram */
-  for(i=0;i<ndata->n;i++) {
-    free_ngram_tuple(&(ndata->d[i]));
+  if (ndata->d) {
+    for(i=0;i<ndata->n;i++) {
+      free_ngram_tuple(&(ndata->d[i]));
+    }
+    free(ndata->d);
   }
   /* free name index tree */
   free_ptree(ndata->root);
Index: julius4/libsent/src/ngram/ngram_read_arpa.c
diff -u julius4/libsent/src/ngram/ngram_read_arpa.c:1.15 julius4/libsent/src/ngram/ngram_read_arpa.c:1.16
--- julius4/libsent/src/ngram/ngram_read_arpa.c:1.15	Tue Feb 10 17:15:48 2009
+++ julius4/libsent/src/ngram/ngram_read_arpa.c	Sat Jul  4 23:11:43 2009
@@ -20,7 +20,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:52:24 2005
  *
- * $Revision: 1.15 $
+ * $Revision: 1.16 $
  * 
  */
 /*
@@ -30,7 +30,7 @@
  * All rights reserved
  */
 
-/* $Id: ngram_read_arpa.c,v 1.15 2009/02/10 08:15:48 sumomo Exp $ */
+/* $Id: ngram_read_arpa.c,v 1.16 2009/07/04 14:11:43 sumomo Exp $ */
 
 /* words should be alphabetically sorted */
 
@@ -45,33 +45,32 @@
  * Set number of N-gram entries, for reading the first LR 2-gram.
  * 
  * @param fp [in] file pointer
- * @param num [out] set the values to this buffer
+ * @param numlist [out] receives a newly allocated buffer holding the number of entries for each N; the caller must free() it
  *
  * @return the value of N, or -1 on error.
  */
 static int
-get_total_info(FILE *fp, NNID num[])
+get_total_info(FILE *fp, NNID **numlist)
 {
   char *p;
   int n;
   int maxn;
   unsigned long entry_num;
+  int numnum;
 
   maxn = 0;
 
+  numnum = 10;
+  *numlist = (NNID *)mymalloc(sizeof(NNID) * numnum);
+
   while (getl(buf, sizeof(buf), fp) != NULL && buf[0] != '\\') {
     if (strnmatch(buf, "ngram", 5)) { /* n-gram num */
-      p = strtok(buf, "=");
-      n = p[strlen(p)-1] - '0';
-      if (n > MAX_N) {
-	jlog("Error: too long N-gram (N=%d)\n", n);
-	jlog("Error: current maximum length of N-gram is set to %d\n", MAX_N);
-	jlog("Error: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
-	return -1;
-      }
-      p = strtok(NULL, "=");
+      //p = strtok(buf, " =");
+      //n = atoi(p);
+      //p = strtok(NULL, " =");
       //entry_num = atol(p);
-      sscanf(p, "%lu", &entry_num);
+      //sscanf(p, "%lu", &entry_num);
+      sscanf(buf, "ngram %d = %lu", &n, &entry_num);
       /* check maximum number */
       if (entry_num > NNID_MAX) {
 	jlog("Error: too big %d-gram (exceeds %d bit)\n", n, sizeof(NNID) * 8);
@@ -81,8 +80,12 @@
       if (entry_num == 0) {
 	jlog("Warning: empty %d-gram, skipped\n", n);
       } else {
-	num[n-1] = entry_num;
 	if (maxn < n) maxn = n;
+	if (n >= numnum) {
+	  numnum *= 2;
+	  *numlist = (NNID *)myrealloc(*numlist, sizeof(NNID) * numnum);
+	}
+	(*numlist)[n-1] = entry_num;
       }
     }
   }
@@ -330,8 +333,8 @@
 set_ngram(FILE *fp, NGRAM_INFO *ndata, int n)
 {
   NNID i;
-  WORD_ID w[MAX_N];
-  WORD_ID w_last[MAX_N];
+  WORD_ID *w;
+  WORD_ID *w_last;
   LOGPROB p, bowt;
   NNID nnid;
   NNID cid, cid_last;
@@ -346,6 +349,9 @@
     return FALSE;
   }
 
+  w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n);
+  w_last = (WORD_ID *)mymalloc(sizeof(WORD_ID) * n);
+
   t = &(ndata->d[n-1]);
   tprev = &(ndata->d[n-2]);
 
@@ -391,6 +397,7 @@
     /* N-gram probability */
     if ((s = strtok(buf, DELM)) == NULL) {
       jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
+      free(w_last); free(w);
       return FALSE;
     }
     p = (LOGPROB)atof(s);
@@ -398,6 +405,7 @@
     for(i=0;i<n;i++) {
       if ((s = strtok(NULL, DELM)) == NULL) {
 	jlog("Error: ngram_read_arpa: %d-gram: failed to parse, corrupted or invalid data?\n", n);
+	free(w_last); free(w);
 	return FALSE;
       }
       if ((w[i] = ngram_lookup_word(ndata, s)) == WORD_INVALID) {
@@ -440,6 +448,7 @@
       if (t->is24bit) {
 	if (t->bgn_upper[cid] != NNID_INVALID_UPPER) {
 	  jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
+	  free(w_last); free(w);
 	  return FALSE;
 	}
 	ntmp = nnid & 0xffff;
@@ -449,6 +458,7 @@
       } else {
 	if (t->bgn[cid] != NNID_INVALID) {
 	  jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
+	  free(w_last); free(w);
 	  return FALSE;
 	}
 	t->bgn[cid] = nnid;
@@ -465,6 +475,7 @@
       continue;
     } else if (w_last[n-1] != WORD_INVALID && w[n-1] < w_last[n-1]) {
       jlog("Error: ngram_read_arpa: %d-gram #%d: \"%s\": word order is not the same as 1-gram\n", n, nnid+1, pbuf);
+      free(w_last); free(w);
       return FALSE;
     }
 
@@ -490,6 +501,7 @@
     /* check total num */
     if (nnid > t->totalnum) {
       jlog("Error: ngram_read_arpa: %d-gram: read num (%d) not match the header value (%d)\n", n, nnid, t->totalnum);
+      free(w_last); free(w);
       return FALSE;
     }
   }
@@ -508,6 +520,7 @@
     jlog("Stat: ngram_read_arpa: %d-gram read %d end\n", n, nnid);
   }
 
+  free(w_last); free(w);
   return ok_p;
 }
 
@@ -525,7 +538,7 @@
 ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, boolean addition)
 {
   int i, n;
-  NNID num[MAX_N];
+  NNID *num;
 
   /* source file is not a binary N-gram */
   ndata->from_bin = FALSE;
@@ -537,15 +550,23 @@
 
   if (addition) {
     /* reading additional forward 2-gram for the 1st pass */
+
+    if (ndata->n < 2) {
+      jlog("Error: base N-gram should be longer than 2-gram\n");
+      return FALSE;
+    }
+
     /* read n-gram total info */
-    n = get_total_info(fp, num);
+    n = get_total_info(fp, &num);
     if (n == -1) {		/* error */
+      free(num);
       return FALSE;
     }
 
     /* check N limit */
     if (n < 2) {
       jlog("Error: forward N-gram for pass1 is does not contain 2-gram\n");
+      free(num);
       return FALSE;
     }
     if (n > 2) {
@@ -558,6 +579,9 @@
 	jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1);
       }
     }
+
+    free(num);
+
     /* read additional 1-gram data */
     if (!strnmatch(buf,"\\1-grams",8)) {
       jlog("Error: ngram_read_arpa: 1-gram not found for additional LR 2-gram\n");
@@ -581,14 +605,18 @@
 
   } else {
     /* read n-gram total info */
-    n = get_total_info(fp, num);
+    n = get_total_info(fp, &num);
     if (n == -1) {		/* error */
+      free(num);
       return FALSE;
     }
     jlog("Stat: ngram_read_arpa: this is %d-gram file\n", n);
+    ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * n);
+    memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * n);
     for(i=0;i<n;i++) {
       ndata->d[i].totalnum = num[i];
     }
+    free(num);
     
     /* set word num */
     if (ndata->d[0].totalnum > MAX_WORD_NUM) {
Index: julius4/libsent/src/ngram/ngram_read_bin.c
diff -u julius4/libsent/src/ngram/ngram_read_bin.c:1.6 julius4/libsent/src/ngram/ngram_read_bin.c:1.7
--- julius4/libsent/src/ngram/ngram_read_bin.c:1.6	Tue Feb 10 17:15:48 2009
+++ julius4/libsent/src/ngram/ngram_read_bin.c	Sat Jul  4 23:11:43 2009
@@ -48,7 +48,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 17:12:08 2005
  *
- * $Revision: 1.6 $
+ * $Revision: 1.7 $
  * 
  */
 /*
@@ -253,14 +253,9 @@
 
   jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n);
 
-  if (ndata->n > MAX_N) {
-    jlog("Error: ngram_read_bin_v5: too long N-gram (N=%d)\n", n);
-    jlog("Error: ngram_read_bin_v5: current maximum length of N-gram is set to %d\n", MAX_N);
-    jlog("Error: ngram_read_bin_v5: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");
-    return FALSE;
-  }
-
   /* read total info and set max_word_num */
+  ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n);
+  memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n);
   for(n=0;n<ndata->n;n++) {
     rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
   }
@@ -374,6 +369,8 @@
   ndata->dir = DIR_RL;
 
   /* read total info and set max_word_num */
+  ndata->d = (NGRAM_TUPLE_INFO *)mymalloc(sizeof(NGRAM_TUPLE_INFO) * ndata->n);
+  memset(ndata->d, 0, sizeof(NGRAM_TUPLE_INFO) * ndata->n);
   for(n=0;n<ndata->n;n++) {
     rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
   }



Julius-cvs メーリングリストの案内
アーカイブの一覧に戻る