diff --git a/Ass1.pdf b/Ass1.pdf new file mode 100644 index 0000000..d234968 Binary files /dev/null and b/Ass1.pdf differ diff --git a/Assignment 1.pdf b/Assignment 1.pdf new file mode 100644 index 0000000..7a804fe Binary files /dev/null and b/Assignment 1.pdf differ diff --git a/ass1.c b/ass1.c new file mode 100644 index 0000000..633e830 --- /dev/null +++ b/ass1.c @@ -0,0 +1,362 @@ +/* License: AGPLv3 or later. https://www.gnu.org/licenses/licenses.html + * + * Assignment 1 - Text Analysis + * Manish + * Student Login: ***** + * + * Compile it as: + * gcc -Wall -std=c11 -o ass1 ass1.c + * + * Relevant code has been carried over from Lab Exercises + * + * Word are ordered by decreasing count and than alphabetically in + * ascending order. Eg: + * 15 hello + * 15 world + */ + +#include +#include +#include +#include +#include + +typedef struct +{ + // Word + int start; + /* Not required as next node's start -1 is end. + * For last word, it's text_size -1. + */ + // int end; + int count; + + // AVL tree + int left; + int right; + int height; +} node; + +// This is the AVL tree +int root = -1; +node tree[50000]; +int tree_size = 0; + +// All words in string pool. Their index in AVL tree +char text[500000]; +int text_size = 0; + +/* Alphabetical index is built by traversing through tree. + * Thereafter, sort indexes by count using merge sort. + */ +int indexes[50000]; +int indexes_size = 0; + +int min(int i, int j); +int max(int i, int j); + +int string_compare(int node_i, int index_j); +int new_entry(int start); +int height(int node); +void update_hegiht(int node); +int insert(int start, int node); +int rotate_right(int old_parent); +int rotate_left(int old_parent); +void build_index(int node_index); + +void merge_sort(); + +void print_word(int node); + +int main(void) +{ + printf("Enter file name: "); + char filename[257]; + // Assuming filename/file path won't be longer than 256 characters + scanf("%256s", filename); + + FILE* file = fopen(filename, "r"); + if (!file) + { + perror(filename); + exit(EXIT_FAILURE); + } + + // read word and add node to AVL tree + int word_start = text_size; + int tree_len = tree_size; + char c; + bool in_word = false; + while ((c = fgetc(file)) != EOF) + { + if (isalpha(c)) + { + if (!in_word) + { + in_word = true; + word_start = text_size; + } + text[text_size++] = tolower(c); + } + else if (isspace(c)) + { + if (in_word) + { + root = insert(word_start, root); + // if word not added because duplicate + if (tree_len == tree_size) + { + text_size = word_start; + } + else + { + tree_len = tree_size; + } + } + in_word = false; + } + } + + fclose(file); + + build_index(root); + + merge_sort(); + + printf("First 10 words in index:\n"); + int upto = min(indexes_size, 10); + for (int i = 0; i < upto; i++) + { + print_word(indexes[i]); + } + printf("\n\nLast 10 words in index:\n"); + int from = max(indexes_size - 11, 0); + for (int i = from; i < indexes_size; i++) + { + print_word(indexes[i]); + } + + return 0; +} + +int min(int i, int j) +{ + if (i <= j) + return i; + return j; +} + +int max(int i, int j) +{ + if (i >= j) + return i; + return j; +} + +/* node_i is a word added to tree and index_j is last word in + * text that needs to be compared. + */ +int string_compare(int node_i, int index_j) +{ + /* If alphabetically node_i comes before than -1 + * if node_i == index_j than 0 + * if alphabetically nojde_j comes before than 1 + */ + int i = tree[node_i].start; + int j = index_j; + int i_end; + if (node_i + 1 == tree_size) // node_i is last entry + i_end = index_j; + else + i_end = tree[node_i + 1].start; // start of next word in text array + int j_end = text_size; + while (i < i_end && j < j_end) + { + if (text[i] < text[j]) + return -1; + if (text[i++] > text[j++]) + return 1; + } + // till now both are equal + if (i < i_end) // i is longer + return 1; + if (j < j_end) // j is longer + return -1; + return 0; // Both equal and same length +} + +int new_entry(int word_start) +{ + tree[tree_size].start = word_start; + tree[tree_size].count = 1; + tree[tree_size].left = -1; + tree[tree_size].right = -1; + tree[tree_size].height = 0; + return tree_size++; +} + +int height(int node) +{ + if (node >= 0) + { + return tree[node].height; + } + return node; // -1 +} + +void update_hegiht(int node) +{ + int left_height = height(tree[node].left); + int right_height = height(tree[node].right); + tree[node].height = max(left_height, right_height) + 1; +} + +int insert(int word_start, int node) +{ + if (node < 0) + { + return new_entry(word_start); + } + int comparison = string_compare(node, word_start); + // new_word == our_word + if (comparison == 0) + tree[node].count++; + // new_word < our_word + else if (comparison > 0) + { + tree[node].left = insert(word_start, tree[node].left); + // if tree imbalance + if (height(tree[node].left) - height(tree[node].right) >= 2) + { + if (string_compare(tree[node].left, word_start) > 0) // Case 1 + { + node = rotate_right(node); + } + else // Case 2 + { + tree[node].left = rotate_left(tree[node].left); + node = rotate_right(node); + } + } + } + // new_word > our_word + else if (comparison < 0) + { + tree[node].right = insert(word_start, tree[node].right); + // if tree imbalance + if (height(tree[node].right) - height(tree[node].left) >= 2) + { + if (string_compare(tree[node].right, word_start) > 0) // Case 3 + { + tree[node].right = rotate_right(tree[node].right); + node = rotate_left(node); + } + else // Case 4 + { + node = rotate_left(node); + } + } + } + update_hegiht(node); + return node; +} + +int rotate_right(int old_parent) +{ + int new_parent = tree[old_parent].left; + tree[old_parent].left = tree[new_parent].right; + tree[new_parent].right = old_parent; + update_hegiht(old_parent); + update_hegiht(new_parent); + return new_parent; // to be updated in it's grand parent node +} + +int rotate_left(int old_parent) +{ + int new_parent = tree[old_parent].right; + tree[old_parent].right = tree[new_parent].left; + tree[new_parent].left = old_parent; + update_hegiht(old_parent); + update_hegiht(new_parent); + return new_parent; // to be updated in it's grand parent node +} + +void build_index(int node) +{ + if (node < 0) + return; + build_index(tree[node].left); + indexes[indexes_size++] = node; + build_index(tree[node].right); +} + +void merge_sort() +{ + int temp[indexes_size]; + + // These pointers will be swapped at every iteration + int* src = indexes; + int* dst = temp; + int* tmp; + + int merge_size = 1; + // It looks O(scary!) but is as efficient as other loop based merge sort + while (merge_size < indexes_size) + { + int i = 0; + while (i < indexes_size) + { + int left_upto = min(i + merge_size, indexes_size); + int l = i; + int r = left_upto; + int right_upto = min(i + (2 * merge_size), indexes_size); + while (l < left_upto && r < right_upto) + { + if (tree[src[l]].count >= tree[src[r]].count) + { + dst[i++] = src[l++]; + } + else + { + dst[i++] = src[r++]; + } + } + for (; l < left_upto; l++) + { + dst[i++] = src[l]; + } + for (; r < right_upto; r++) + { + dst[i++] = src[r]; + } + } + tmp = src; + src = dst; + dst = tmp; + merge_size *= 2; + } + /* if loop ended with final merge in temp array than copy it back to + * indexes array + */ + if (&indexes[0] != &src[0]) + { + for (int i = 0; i < indexes_size; i++) + { + indexes[i] = src[i]; + } + } +} + +void print_word(int node) +{ + int word_length; + if (node + 1 == tree_size) // last word in tree + word_length = text_size - tree[node].start; + else + word_length = tree[node + 1].start - tree[node].start; + printf( + "%d %.*s\n", + tree[node].count, + word_length, + &text[tree[node].start]); +}