aboutsummaryrefslogtreecommitdiff
path: root/utils/internal/zc.c
diff options
context:
space:
mode:
Diffstat (limited to 'utils/internal/zc.c')
-rw-r--r--utils/internal/zc.c107
1 files changed, 107 insertions, 0 deletions
diff --git a/utils/internal/zc.c b/utils/internal/zc.c
new file mode 100644
index 0000000..eff94bd
--- /dev/null
+++ b/utils/internal/zc.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+#define PACKAGE "wgram"
+#define VERSION "0.0.4"
+#define MAXLINE 1024
+#define MAXGRAM 32
+
+/* status epilepticus .. print help */
+void print_help(int exval);
+
+int main (int argc, char *argv[]) {
+ /* word delimeter for strtok() */
+ char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
+ char line[MAXLINE]; /* input buff, fgets() */
+ char *stray = NULL; /* returned value by strtok() */
+ char **strarray = NULL; /* array to hold all entrys */
+ int i = 0; /* general counter */
+ int strcount = 0; /* number of entrys in pointer array */
+ int N = 3, pos = 0; /* ngram size, 3 in this case */
+ int opt = 0; /* holds command line opt nr.. */
+ int word_flag = 0; /* print only the `raw' words */
+ FILE *fp = stdin; /* read input from `FILE', default is stdin */
+
+ while((opt = getopt(argc, argv, "hvn:wf:")) != -1) {
+ switch(opt) {
+ case 'h':
+ print_help(0);
+ break;
+ case 'v':
+ exit(0);
+ break;
+ case 'n':
+ N = atoi(optarg);
+ if(N > MAXGRAM || N < 2) {
+ fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n",
+ PACKAGE, N, MAXGRAM);
+ return 1;
+ }
+ break;
+ case 'w':
+ word_flag = 1;
+ break;
+ case 'f':
+ if(freopen(optarg, "r", fp) == NULL) {
+ fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg);
+ return 1;
+ }
+ break;
+ case '?':
+ fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt);
+ print_help(1);
+ } /* switch */
+ } /* while */
+
+ /* start reading lines from file pointer, add all entrys to **strarray */
+ while((fgets(line, MAXLINE, fp)) != NULL) {
+ if(strlen(line) < 2)
+ continue;
+
+ stray = strtok(line, delim);
+ while(stray != NULL) {
+ strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *));
+ strarray[strcount++] = strdup(stray);
+ stray = strtok(NULL, delim);
+ }
+ }
+
+ if(word_flag == 0) {
+ /*
+ // print the array of strings, jumping back each time
+ // (N - 1) positions if a whole ngram of words has been printed
+ */
+ for(i = 0, pos = N; i < strcount; i++, pos--) {
+ if(pos == 0) pos = N, i -= (N - 1), printf("\n");
+ printf("%s ", strarray[i]);
+ }
+ printf("\n");
+ } else {
+ /* print raw words */
+ for(i = 0; i < strcount; i++)
+ printf("%s\n", strarray[i]);
+ }
+
+ /* free the string array */
+ for(i = 0; i < strcount; i++)
+ free(strarray[i]);
+
+ free(strarray);
+ return 0;
+}
+
+/* status epilepticus .. print help */
+void print_help(int exval) {
+ printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION);
+ printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE);
+
+ printf(" -h print this help and exit\n");
+ printf(" -v print version and exit\n\n");
+
+ printf(" -n INT set ngram length (default=3)\n");
+ printf(" -w print only the extracted words\n");
+ printf(" -f FILE read input from `FILE' (default=stdin)\n\n");
+ exit(exval);
+}