You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

362 lines
8.3 KiB

/* License: AGPLv3 or later. https://www.gnu.org/licenses/licenses.html
*
* Assignment 1 - Text Analysis
* Manish
* Student Login: *****
*
* Compile it as:
* gcc -Wall -std=c11 -o ass1 ass1.c
*
* Relevant code has been carried over from Lab Exercises
*
* Words are ordered by decreasing count and then alphabetically in
* ascending order. Eg:
* 15 hello
* 15 world
*/
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// One distinct word together with its AVL-tree bookkeeping.
// Nodes live in the global tree[] array and refer to each other by array
// index; a link value of -1 means "no child".
typedef struct
{
// Index into the global text[] pool where this word's characters begin.
int start;
/* No explicit end index is stored: a word ends where the next node's
* word starts, and the last node's word ends at text_size.
*/
// Number of times the word has been seen (set to 1 on creation).
int count;
// AVL tree links (indices into tree[], -1 when absent)
int left;
int right;
// Height of the subtree rooted here; a freshly created leaf has height 0.
int height;
} node;
/* Capacity limits of the statically allocated storage below.
 * tree[] and indexes[] must have the same capacity because the index
 * holds exactly one entry per tree node.
 */
enum
{
    MAX_WORDS = 50000,      /* maximum number of distinct words */
    MAX_TEXT_CHARS = 500000 /* maximum characters over all distinct words */
};

// The AVL tree of distinct words, stored as an array of nodes.
int root = -1;        // index of the root node in tree[], -1 while empty
node tree[MAX_WORDS];
int tree_size = 0;    // number of nodes currently in use

/* String pool: the characters of every distinct word, stored back to back
 * without separators. Each tree node records where its word starts.
 */
char text[MAX_TEXT_CHARS];
int text_size = 0;    // number of characters currently in use

/* Alphabetical index is built by traversing through tree.
 * Thereafter, indexes are sorted by count using merge sort.
 */
int indexes[MAX_WORDS];
int indexes_size = 0; // number of entries currently in use
// Forward declarations for the helpers defined after main().

// Smaller / larger of two ints.
int min(int i, int j);
int max(int i, int j);
// Compares the word of tree node node_i with the word starting at text
// index index_j; returns -1, 0 or 1.
int string_compare(int node_i, int index_j);
// Appends a new node for the word starting at the given text index and
// returns the new node's index.
int new_entry(int start);
// Stored height of a node, or the (negative) argument itself for "no node".
int height(int node);
// Recomputes a node's height from the heights of its children.
void update_hegiht(int node);
// Inserts the word starting at text index start into the subtree rooted at
// node; returns the index of the (possibly new) subtree root.
int insert(int start, int node);
// AVL rotations; each returns the index of the node that replaces
// old_parent as subtree root.
int rotate_right(int old_parent);
int rotate_left(int old_parent);
// In-order traversal that appends node indices to indexes[].
void build_index(int node_index);
// Sorts indexes[] by decreasing word count.
void merge_sort();
// Prints one "count word" line for the given node.
void print_word(int node);
/* Inserts the word occupying text[word_start .. text_size) into the tree.
 * If no node was created (the word was already present and only its count
 * was bumped), the characters are removed from the string pool again so
 * the pool holds every distinct word exactly once.
 */
static void commit_word(int word_start)
{
    int nodes_before = tree_size;

    root = insert(word_start, root);
    if (tree_size == nodes_before)
    {
        text_size = word_start;
    }
}

/* Reads a file name from stdin, counts the words in that file and prints
 * the first and last 10 entries of the index (ordered by decreasing count,
 * ties alphabetically).
 *
 * A word is a run of letters terminated by whitespace or end of file;
 * letters are lower-cased. Characters that are neither letters nor
 * whitespace are skipped and do not end the current word.
 */
int main(void)
{
    printf("Enter file name: ");
    char filename[257];
    // Assuming filename/file path won't be longer than 256 characters
    if (scanf("%256s", filename) != 1)
    {
        fprintf(stderr, "No file name given\n");
        exit(EXIT_FAILURE);
    }
    FILE *file = fopen(filename, "r");
    if (!file)
    {
        perror(filename);
        exit(EXIT_FAILURE);
    }

    // Read words and add them to the AVL tree
    const int text_capacity = (int)(sizeof text / sizeof text[0]);
    int word_start = text_size;
    bool in_word = false;
    int c; // int, not char: must hold every byte value as well as EOF
    while ((c = fgetc(file)) != EOF)
    {
        if (isalpha(c))
        {
            if (!in_word)
            {
                in_word = true;
                word_start = text_size;
            }
            if (text_size >= text_capacity)
            {
                fprintf(stderr, "%s: too much text for the word pool\n",
                        filename);
                fclose(file);
                exit(EXIT_FAILURE);
            }
            text[text_size++] = (char)tolower(c);
        }
        else if (isspace(c) && in_word)
        {
            commit_word(word_start);
            in_word = false;
        }
    }
    // A file that does not end in whitespace still has a pending word.
    if (in_word)
    {
        commit_word(word_start);
    }
    fclose(file);

    build_index(root);
    merge_sort();

    printf("First 10 words in index:\n");
    int upto = min(indexes_size, 10);
    for (int i = 0; i < upto; i++)
    {
        print_word(indexes[i]);
    }

    printf("\n\nLast 10 words in index:\n");
    int from = max(indexes_size - 10, 0); // exactly the last 10 entries
    for (int i = from; i < indexes_size; i++)
    {
        print_word(indexes[i]);
    }
    return 0;
}
/* Returns the smaller of i and j (i when they are equal). */
int min(int i, int j)
{
    return (j < i) ? j : i;
}
/* Returns the larger of i and j (i when they are equal). */
int max(int i, int j)
{
    return (j > i) ? j : i;
}
/* Compares the word stored for tree node node_i with the candidate word
 * occupying text[index_j .. text_size).
 *
 * Returns:
 *   -1 if the node's word sorts before the candidate,
 *    0 if both words are identical,
 *    1 if the node's word sorts after the candidate.
 *
 * The node's word ends where the next node's word starts; for the last
 * node in tree[] it ends at index_j.
 */
int string_compare(int node_i, int index_j)
{
    int stored_end;
    if (node_i + 1 == tree_size)
    {
        stored_end = index_j;
    }
    else
    {
        stored_end = tree[node_i + 1].start;
    }

    int stored = tree[node_i].start;
    int candidate = index_j;
    for (; stored < stored_end && candidate < text_size; stored++, candidate++)
    {
        if (text[stored] != text[candidate])
        {
            return (text[stored] < text[candidate]) ? -1 : 1;
        }
    }

    // One word is a prefix of the other: the shorter one sorts first.
    if (stored < stored_end)
    {
        return 1;
    }
    return (candidate < text_size) ? -1 : 0;
}
/* Appends a new leaf node for the word starting at text[word_start] to
 * tree[] and returns its index.
 *
 * The node starts with count 1, no children and height 0. If tree[] is
 * already full the program reports the problem on stderr and exits,
 * instead of writing past the end of the array.
 */
int new_entry(int word_start)
{
    const int capacity = (int)(sizeof tree / sizeof tree[0]);
    if (tree_size >= capacity)
    {
        fprintf(stderr, "Too many distinct words (limit %d)\n", capacity);
        exit(EXIT_FAILURE);
    }

    tree[tree_size].start = word_start;
    tree[tree_size].count = 1;
    tree[tree_size].left = -1;
    tree[tree_size].right = -1;
    tree[tree_size].height = 0;
    return tree_size++;
}
/* Returns the stored height of the given node. For a negative index
 * (no node; callers pass -1) the index itself is returned.
 */
int height(int node)
{
    if (node < 0)
    {
        return node;
    }
    return tree[node].height;
}
/* Sets the node's height to one more than the height of its taller child. */
void update_hegiht(int node)
{
    int taller = height(tree[node].left);
    int other = height(tree[node].right);
    if (other > taller)
    {
        taller = other;
    }
    tree[node].height = taller + 1;
}
/* Inserts the word starting at text[word_start] into the AVL subtree
 * rooted at node and returns the index of the subtree's root afterwards
 * (which changes when a rotation happens or the subtree was empty).
 *
 * If the word is already present its count is incremented and no node is
 * created.
 */
int insert(int word_start, int node)
{
    // Empty subtree: the word becomes a new leaf.
    if (node < 0)
    {
        return new_entry(word_start);
    }

    int order = string_compare(node, word_start);
    if (order == 0)
    {
        // Same word: just count it.
        tree[node].count++;
    }
    else if (order > 0)
    {
        // New word sorts before this node: goes into the left subtree.
        tree[node].left = insert(word_start, tree[node].left);
        int lean = height(tree[node].left) - height(tree[node].right);
        if (lean >= 2)
        {
            // Left-right case needs the child rotated first; left-left
            // case only needs the outer rotation.
            if (string_compare(tree[node].left, word_start) <= 0)
            {
                tree[node].left = rotate_left(tree[node].left);
            }
            node = rotate_right(node);
        }
    }
    else
    {
        // New word sorts after this node: goes into the right subtree.
        tree[node].right = insert(word_start, tree[node].right);
        int lean = height(tree[node].right) - height(tree[node].left);
        if (lean >= 2)
        {
            // Right-left case needs the child rotated first; right-right
            // case only needs the outer rotation.
            if (string_compare(tree[node].right, word_start) > 0)
            {
                tree[node].right = rotate_right(tree[node].right);
            }
            node = rotate_left(node);
        }
    }

    update_hegiht(node);
    return node;
}
/* Right rotation: the left child of old_parent becomes the subtree root,
 * old_parent becomes its right child, and the new root's former right
 * subtree is re-attached as old_parent's left subtree.
 * Returns the index of the new subtree root, which the caller must store
 * in the link that previously pointed to old_parent.
 */
int rotate_right(int old_parent)
{
    int pivot = tree[old_parent].left;
    int moved_subtree = tree[pivot].right;

    tree[pivot].right = old_parent;
    tree[old_parent].left = moved_subtree;

    // old_parent is now below pivot, so its height is refreshed first.
    update_hegiht(old_parent);
    update_hegiht(pivot);
    return pivot;
}
/* Left rotation: the right child of old_parent becomes the subtree root,
 * old_parent becomes its left child, and the new root's former left
 * subtree is re-attached as old_parent's right subtree.
 * Returns the index of the new subtree root, which the caller must store
 * in the link that previously pointed to old_parent.
 */
int rotate_left(int old_parent)
{
    int pivot = tree[old_parent].right;
    int moved_subtree = tree[pivot].left;

    tree[pivot].left = old_parent;
    tree[old_parent].right = moved_subtree;

    // old_parent is now below pivot, so its height is refreshed first.
    update_hegiht(old_parent);
    update_hegiht(pivot);
    return pivot;
}
/* Appends the node indices of the subtree rooted at node to indexes[]
 * in in-order (left subtree, node, right subtree) sequence.
 * A negative node index denotes an empty subtree and adds nothing.
 */
void build_index(int node)
{
    if (node >= 0)
    {
        build_index(tree[node].left);
        indexes[indexes_size] = node;
        indexes_size++;
        build_index(tree[node].right);
    }
}
/* Sorts indexes[0 .. indexes_size) by decreasing word count using a
 * bottom-up (iterative) merge sort.
 *
 * The sort is stable: entries with equal counts keep their relative
 * order, so an index that was alphabetical beforehand stays alphabetical
 * within each count.
 *
 * A static scratch buffer the size of indexes[] is used instead of a
 * variable-length array, which would have zero length (undefined
 * behaviour) for an empty index and use a lot of stack for a large one.
 */
void merge_sort(void)
{
    static int temp[sizeof indexes / sizeof indexes[0]];

    // Zero or one entry is already sorted.
    if (indexes_size < 2)
    {
        return;
    }

    // Source and destination buffers are swapped after every pass.
    int *src = indexes;
    int *dst = temp;

    for (int merge_size = 1; merge_size < indexes_size; merge_size *= 2)
    {
        // Merge each pair of adjacent sorted runs of length merge_size.
        int i = 0;
        while (i < indexes_size)
        {
            int left_upto = min(i + merge_size, indexes_size);
            int right_upto = min(i + (2 * merge_size), indexes_size);
            int l = i;
            int r = left_upto;
            while (l < left_upto && r < right_upto)
            {
                // ">=" takes the left entry on ties, keeping the sort stable.
                if (tree[src[l]].count >= tree[src[r]].count)
                {
                    dst[i++] = src[l++];
                }
                else
                {
                    dst[i++] = src[r++];
                }
            }
            // Copy whatever remains of either run.
            while (l < left_upto)
            {
                dst[i++] = src[l++];
            }
            while (r < right_upto)
            {
                dst[i++] = src[r++];
            }
        }

        int *swap = src;
        src = dst;
        dst = swap;
    }

    // If the final pass wrote into the scratch buffer, copy the result back.
    if (src != indexes)
    {
        for (int i = 0; i < indexes_size; i++)
        {
            indexes[i] = src[i];
        }
    }
}
/* Prints one line of the form "<count> <word>" for the given tree node.
 * The word is not NUL-terminated in the pool, so its length is derived
 * from where the next node's word starts (or from text_size for the last
 * node in tree[]) and passed to printf as a precision.
 */
void print_word(int node)
{
    int first = tree[node].start;
    int past_end = (node + 1 == tree_size) ? text_size
                                           : tree[node + 1].start;

    printf("%d %.*s\n", tree[node].count, past_end - first, &text[first]);
}