/* License: AGPLv3 or later. https://www.gnu.org/licenses/licenses.html * * Assignment 1 - Text Analysis * Manish * Student Login: ***** * * Compile it as: * gcc -Wall -std=c11 -o ass1 ass1.c * * Relevant code has been carried over from Lab Exercises * * Word are ordered by decreasing count and than alphabetically in * ascending order. Eg: * 15 hello * 15 world */ #include #include #include #include #include typedef struct { // Word int start; /* Not required as next node's start -1 is end. * For last word, it's text_size -1. */ // int end; int count; // AVL tree int left; int right; int height; } node; // This is the AVL tree int root = -1; node tree[50000]; int tree_size = 0; // All words in string pool. Their index in AVL tree char text[500000]; int text_size = 0; /* Alphabetical index is built by traversing through tree. * Thereafter, sort indexes by count using merge sort. */ int indexes[50000]; int indexes_size = 0; int min(int i, int j); int max(int i, int j); int string_compare(int node_i, int index_j); int new_entry(int start); int height(int node); void update_hegiht(int node); int insert(int start, int node); int rotate_right(int old_parent); int rotate_left(int old_parent); void build_index(int node_index); void merge_sort(); void print_word(int node); int main(void) { printf("Enter file name: "); char filename[257]; // Assuming filename/file path won't be longer than 256 characters scanf("%256s", filename); FILE* file = fopen(filename, "r"); if (!file) { perror(filename); exit(EXIT_FAILURE); } // read word and add node to AVL tree int word_start = text_size; int tree_len = tree_size; char c; bool in_word = false; while ((c = fgetc(file)) != EOF) { if (isalpha(c)) { if (!in_word) { in_word = true; word_start = text_size; } text[text_size++] = tolower(c); } else if (isspace(c)) { if (in_word) { root = insert(word_start, root); // if word not added because duplicate if (tree_len == tree_size) { text_size = word_start; } else { tree_len = tree_size; } } in_word = false; } } fclose(file); build_index(root); merge_sort(); printf("First 10 words in index:\n"); int upto = min(indexes_size, 10); for (int i = 0; i < upto; i++) { print_word(indexes[i]); } printf("\n\nLast 10 words in index:\n"); int from = max(indexes_size - 11, 0); for (int i = from; i < indexes_size; i++) { print_word(indexes[i]); } return 0; } int min(int i, int j) { if (i <= j) return i; return j; } int max(int i, int j) { if (i >= j) return i; return j; } /* node_i is a word added to tree and index_j is last word in * text that needs to be compared. */ int string_compare(int node_i, int index_j) { /* If alphabetically node_i comes before than -1 * if node_i == index_j than 0 * if alphabetically nojde_j comes before than 1 */ int i = tree[node_i].start; int j = index_j; int i_end; if (node_i + 1 == tree_size) // node_i is last entry i_end = index_j; else i_end = tree[node_i + 1].start; // start of next word in text array int j_end = text_size; while (i < i_end && j < j_end) { if (text[i] < text[j]) return -1; if (text[i++] > text[j++]) return 1; } // till now both are equal if (i < i_end) // i is longer return 1; if (j < j_end) // j is longer return -1; return 0; // Both equal and same length } int new_entry(int word_start) { tree[tree_size].start = word_start; tree[tree_size].count = 1; tree[tree_size].left = -1; tree[tree_size].right = -1; tree[tree_size].height = 0; return tree_size++; } int height(int node) { if (node >= 0) { return tree[node].height; } return node; // -1 } void update_hegiht(int node) { int left_height = height(tree[node].left); int right_height = height(tree[node].right); tree[node].height = max(left_height, right_height) + 1; } int insert(int word_start, int node) { if (node < 0) { return new_entry(word_start); } int comparison = string_compare(node, word_start); // new_word == our_word if (comparison == 0) tree[node].count++; // new_word < our_word else if (comparison > 0) { tree[node].left = insert(word_start, tree[node].left); // if tree imbalance if (height(tree[node].left) - height(tree[node].right) >= 2) { if (string_compare(tree[node].left, word_start) > 0) // Case 1 { node = rotate_right(node); } else // Case 2 { tree[node].left = rotate_left(tree[node].left); node = rotate_right(node); } } } // new_word > our_word else if (comparison < 0) { tree[node].right = insert(word_start, tree[node].right); // if tree imbalance if (height(tree[node].right) - height(tree[node].left) >= 2) { if (string_compare(tree[node].right, word_start) > 0) // Case 3 { tree[node].right = rotate_right(tree[node].right); node = rotate_left(node); } else // Case 4 { node = rotate_left(node); } } } update_hegiht(node); return node; } int rotate_right(int old_parent) { int new_parent = tree[old_parent].left; tree[old_parent].left = tree[new_parent].right; tree[new_parent].right = old_parent; update_hegiht(old_parent); update_hegiht(new_parent); return new_parent; // to be updated in it's grand parent node } int rotate_left(int old_parent) { int new_parent = tree[old_parent].right; tree[old_parent].right = tree[new_parent].left; tree[new_parent].left = old_parent; update_hegiht(old_parent); update_hegiht(new_parent); return new_parent; // to be updated in it's grand parent node } void build_index(int node) { if (node < 0) return; build_index(tree[node].left); indexes[indexes_size++] = node; build_index(tree[node].right); } void merge_sort() { int temp[indexes_size]; // These pointers will be swapped at every iteration int* src = indexes; int* dst = temp; int* tmp; int merge_size = 1; // It looks O(scary!) but is as efficient as other loop based merge sort while (merge_size < indexes_size) { int i = 0; while (i < indexes_size) { int left_upto = min(i + merge_size, indexes_size); int l = i; int r = left_upto; int right_upto = min(i + (2 * merge_size), indexes_size); while (l < left_upto && r < right_upto) { if (tree[src[l]].count >= tree[src[r]].count) { dst[i++] = src[l++]; } else { dst[i++] = src[r++]; } } for (; l < left_upto; l++) { dst[i++] = src[l]; } for (; r < right_upto; r++) { dst[i++] = src[r]; } } tmp = src; src = dst; dst = tmp; merge_size *= 2; } /* if loop ended with final merge in temp array than copy it back to * indexes array */ if (&indexes[0] != &src[0]) { for (int i = 0; i < indexes_size; i++) { indexes[i] = src[i]; } } } void print_word(int node) { int word_length; if (node + 1 == tree_size) // last word in tree word_length = text_size - tree[node].start; else word_length = tree[node + 1].start - tree[node].start; printf( "%d %.*s\n", tree[node].count, word_length, &text[tree[node].start]); }