363 lines
8.3 KiB
C
363 lines
8.3 KiB
C
/* License: AGPLv3 or later. https://www.gnu.org/licenses/licenses.html
|
|
*
|
|
* Assignment 1 - Text Analysis
|
|
* Manish
|
|
* Student Login: *****
|
|
*
|
|
* Compile it as:
|
|
* gcc -Wall -std=c11 -o ass1 ass1.c
|
|
*
|
|
* Relevant code has been carried over from Lab Exercises
|
|
*
|
|
* Word are ordered by decreasing count and than alphabetically in
|
|
* ascending order. Eg:
|
|
* 15 hello
|
|
* 15 world
|
|
*/
|
|
|
|
#include <ctype.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
typedef struct
|
|
{
|
|
// Word
|
|
int start;
|
|
/* Not required as next node's start -1 is end.
|
|
* For last word, it's text_size -1.
|
|
*/
|
|
// int end;
|
|
int count;
|
|
|
|
// AVL tree
|
|
int left;
|
|
int right;
|
|
int height;
|
|
} node;
|
|
|
|
// This is the AVL tree
|
|
int root = -1;
|
|
node tree[50000];
|
|
int tree_size = 0;
|
|
|
|
// All words in string pool. Their index in AVL tree
|
|
char text[500000];
|
|
int text_size = 0;
|
|
|
|
/* Alphabetical index is built by traversing through tree.
|
|
* Thereafter, sort indexes by count using merge sort.
|
|
*/
|
|
int indexes[50000];
|
|
int indexes_size = 0;
|
|
|
|
int min(int i, int j);
|
|
int max(int i, int j);
|
|
|
|
int string_compare(int node_i, int index_j);
|
|
int new_entry(int start);
|
|
int height(int node);
|
|
void update_hegiht(int node);
|
|
int insert(int start, int node);
|
|
int rotate_right(int old_parent);
|
|
int rotate_left(int old_parent);
|
|
void build_index(int node_index);
|
|
|
|
void merge_sort();
|
|
|
|
void print_word(int node);
|
|
|
|
int main(void)
|
|
{
|
|
printf("Enter file name: ");
|
|
char filename[257];
|
|
// Assuming filename/file path won't be longer than 256 characters
|
|
scanf("%256s", filename);
|
|
|
|
FILE* file = fopen(filename, "r");
|
|
if (!file)
|
|
{
|
|
perror(filename);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// read word and add node to AVL tree
|
|
int word_start = text_size;
|
|
int tree_len = tree_size;
|
|
char c;
|
|
bool in_word = false;
|
|
while ((c = fgetc(file)) != EOF)
|
|
{
|
|
if (isalpha(c))
|
|
{
|
|
if (!in_word)
|
|
{
|
|
in_word = true;
|
|
word_start = text_size;
|
|
}
|
|
text[text_size++] = tolower(c);
|
|
}
|
|
else if (isspace(c))
|
|
{
|
|
if (in_word)
|
|
{
|
|
root = insert(word_start, root);
|
|
// if word not added because duplicate
|
|
if (tree_len == tree_size)
|
|
{
|
|
text_size = word_start;
|
|
}
|
|
else
|
|
{
|
|
tree_len = tree_size;
|
|
}
|
|
}
|
|
in_word = false;
|
|
}
|
|
}
|
|
|
|
fclose(file);
|
|
|
|
build_index(root);
|
|
|
|
merge_sort();
|
|
|
|
printf("First 10 words in index:\n");
|
|
int upto = min(indexes_size, 10);
|
|
for (int i = 0; i < upto; i++)
|
|
{
|
|
print_word(indexes[i]);
|
|
}
|
|
printf("\n\nLast 10 words in index:\n");
|
|
int from = max(indexes_size - 11, 0);
|
|
for (int i = from; i < indexes_size; i++)
|
|
{
|
|
print_word(indexes[i]);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int min(int i, int j)
|
|
{
|
|
if (i <= j)
|
|
return i;
|
|
return j;
|
|
}
|
|
|
|
int max(int i, int j)
|
|
{
|
|
if (i >= j)
|
|
return i;
|
|
return j;
|
|
}
|
|
|
|
/* node_i is a word added to tree and index_j is last word in
|
|
* text that needs to be compared.
|
|
*/
|
|
int string_compare(int node_i, int index_j)
|
|
{
|
|
/* If alphabetically node_i comes before than -1
|
|
* if node_i == index_j than 0
|
|
* if alphabetically nojde_j comes before than 1
|
|
*/
|
|
int i = tree[node_i].start;
|
|
int j = index_j;
|
|
int i_end;
|
|
if (node_i + 1 == tree_size) // node_i is last entry
|
|
i_end = index_j;
|
|
else
|
|
i_end = tree[node_i + 1].start; // start of next word in text array
|
|
int j_end = text_size;
|
|
while (i < i_end && j < j_end)
|
|
{
|
|
if (text[i] < text[j])
|
|
return -1;
|
|
if (text[i++] > text[j++])
|
|
return 1;
|
|
}
|
|
// till now both are equal
|
|
if (i < i_end) // i is longer
|
|
return 1;
|
|
if (j < j_end) // j is longer
|
|
return -1;
|
|
return 0; // Both equal and same length
|
|
}
|
|
|
|
int new_entry(int word_start)
|
|
{
|
|
tree[tree_size].start = word_start;
|
|
tree[tree_size].count = 1;
|
|
tree[tree_size].left = -1;
|
|
tree[tree_size].right = -1;
|
|
tree[tree_size].height = 0;
|
|
return tree_size++;
|
|
}
|
|
|
|
int height(int node)
|
|
{
|
|
if (node >= 0)
|
|
{
|
|
return tree[node].height;
|
|
}
|
|
return node; // -1
|
|
}
|
|
|
|
void update_hegiht(int node)
|
|
{
|
|
int left_height = height(tree[node].left);
|
|
int right_height = height(tree[node].right);
|
|
tree[node].height = max(left_height, right_height) + 1;
|
|
}
|
|
|
|
int insert(int word_start, int node)
|
|
{
|
|
if (node < 0)
|
|
{
|
|
return new_entry(word_start);
|
|
}
|
|
int comparison = string_compare(node, word_start);
|
|
// new_word == our_word
|
|
if (comparison == 0)
|
|
tree[node].count++;
|
|
// new_word < our_word
|
|
else if (comparison > 0)
|
|
{
|
|
tree[node].left = insert(word_start, tree[node].left);
|
|
// if tree imbalance
|
|
if (height(tree[node].left) - height(tree[node].right) >= 2)
|
|
{
|
|
if (string_compare(tree[node].left, word_start) > 0) // Case 1
|
|
{
|
|
node = rotate_right(node);
|
|
}
|
|
else // Case 2
|
|
{
|
|
tree[node].left = rotate_left(tree[node].left);
|
|
node = rotate_right(node);
|
|
}
|
|
}
|
|
}
|
|
// new_word > our_word
|
|
else if (comparison < 0)
|
|
{
|
|
tree[node].right = insert(word_start, tree[node].right);
|
|
// if tree imbalance
|
|
if (height(tree[node].right) - height(tree[node].left) >= 2)
|
|
{
|
|
if (string_compare(tree[node].right, word_start) > 0) // Case 3
|
|
{
|
|
tree[node].right = rotate_right(tree[node].right);
|
|
node = rotate_left(node);
|
|
}
|
|
else // Case 4
|
|
{
|
|
node = rotate_left(node);
|
|
}
|
|
}
|
|
}
|
|
update_hegiht(node);
|
|
return node;
|
|
}
|
|
|
|
int rotate_right(int old_parent)
|
|
{
|
|
int new_parent = tree[old_parent].left;
|
|
tree[old_parent].left = tree[new_parent].right;
|
|
tree[new_parent].right = old_parent;
|
|
update_hegiht(old_parent);
|
|
update_hegiht(new_parent);
|
|
return new_parent; // to be updated in it's grand parent node
|
|
}
|
|
|
|
int rotate_left(int old_parent)
|
|
{
|
|
int new_parent = tree[old_parent].right;
|
|
tree[old_parent].right = tree[new_parent].left;
|
|
tree[new_parent].left = old_parent;
|
|
update_hegiht(old_parent);
|
|
update_hegiht(new_parent);
|
|
return new_parent; // to be updated in it's grand parent node
|
|
}
|
|
|
|
void build_index(int node)
|
|
{
|
|
if (node < 0)
|
|
return;
|
|
build_index(tree[node].left);
|
|
indexes[indexes_size++] = node;
|
|
build_index(tree[node].right);
|
|
}
|
|
|
|
void merge_sort()
|
|
{
|
|
int temp[indexes_size];
|
|
|
|
// These pointers will be swapped at every iteration
|
|
int* src = indexes;
|
|
int* dst = temp;
|
|
int* tmp;
|
|
|
|
int merge_size = 1;
|
|
// It looks O(scary!) but is as efficient as other loop based merge sort
|
|
while (merge_size < indexes_size)
|
|
{
|
|
int i = 0;
|
|
while (i < indexes_size)
|
|
{
|
|
int left_upto = min(i + merge_size, indexes_size);
|
|
int l = i;
|
|
int r = left_upto;
|
|
int right_upto = min(i + (2 * merge_size), indexes_size);
|
|
while (l < left_upto && r < right_upto)
|
|
{
|
|
if (tree[src[l]].count >= tree[src[r]].count)
|
|
{
|
|
dst[i++] = src[l++];
|
|
}
|
|
else
|
|
{
|
|
dst[i++] = src[r++];
|
|
}
|
|
}
|
|
for (; l < left_upto; l++)
|
|
{
|
|
dst[i++] = src[l];
|
|
}
|
|
for (; r < right_upto; r++)
|
|
{
|
|
dst[i++] = src[r];
|
|
}
|
|
}
|
|
tmp = src;
|
|
src = dst;
|
|
dst = tmp;
|
|
merge_size *= 2;
|
|
}
|
|
/* if loop ended with final merge in temp array than copy it back to
|
|
* indexes array
|
|
*/
|
|
if (&indexes[0] != &src[0])
|
|
{
|
|
for (int i = 0; i < indexes_size; i++)
|
|
{
|
|
indexes[i] = src[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
void print_word(int node)
|
|
{
|
|
int word_length;
|
|
if (node + 1 == tree_size) // last word in tree
|
|
word_length = text_size - tree[node].start;
|
|
else
|
|
word_length = tree[node + 1].start - tree[node].start;
|
|
printf(
|
|
"%d %.*s\n",
|
|
tree[node].count,
|
|
word_length,
|
|
&text[tree[node].start]);
|
|
}
|