// offline 3
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Capacity limits for the in-memory corpus and the token table. */
/* Largest document count accepted by set_documents(); first dimension of documents[]. */
#define MAX_DOCS 50
/* Size in bytes of each document buffer, including the terminating '\0'. */
#define MAX_LEN 5000
/* Number of rows in tokens[]; tokenize_all() stops storing tokens at this count. */
#define MAX_TOKENS 500
/* Size in bytes of each token / stop-word / query-word buffer, including '\0'. */
#define MAX_TOKEN_LEN 50
/* Number of entries in stop_words[]. */
#define NUM_STOP_WORDS 8
/* Raw document text: filled by set_documents(), lower-cased in place by normalize_case_all(). */
char documents[MAX_DOCS][MAX_LEN];
/* One flat token list covering all documents: built by tokenize_all(), then
 * filtered by remove_stop_words_all() and shortened by stem_all_tokens(). */
char tokens[MAX_TOKENS][MAX_TOKEN_LEN];
/* Lower-case words that is_stop_word() matches exactly. */
char stop_words[NUM_STOP_WORDS][MAX_TOKEN_LEN] = {
"the", "is", "a", "an", "and", "in", "of", "to"
};
/* Number of valid rows in documents[]; 0 means no documents have been set. */
int num_docs = 0;
/* Number of valid rows in tokens[]. */
int token_count = 0;
// Forward declarations for every function defined after main().

// Input helpers.
// Cuts str at its first '\n', if any.
void remove_newline(char str[]);
// Returns 1 if word equals an entry of stop_words[], otherwise 0.
int is_stop_word(char word[]);
// Prompts for the document count and then for each document's text.
void set_documents();

// Preprocessing pipeline; main() runs these in this order for 'preprocess'.
// Lower-cases documents[] in place and prints them.
void normalize_case_all();
// Rebuilds tokens[] from documents[] and prints the token list.
void tokenize_all();
// Drops stop words from tokens[] and prints the remaining tokens.
void remove_stop_words_all();
// Strips "ing", "ed" or a trailing "s" from each token and prints the result.
void stem_all_tokens();

// Scoring. The compute_* functions re-scan documents[] rather than reading tokens[].
// Term frequency of word within document doc_id (0-based index).
double compute_tf(char word[], int doc_id);
// Inverse document frequency of word (base-10 logarithm).
double compute_idf(char word[]);
// Prints TF * IDF of word for every document.
void compute_tfidf_all(char word[]);

// Reporting.
// Prints TF, IDF and TF-IDF tables for every distinct token.
void display_stat();
// Prints the list of available commands.
void help();
/*
 * Reads one line from stdin into buf (at most size - 1 characters) and strips
 * the trailing newline. When the typed line is longer than the buffer, the
 * remainder is discarded so it cannot be mistaken for the next input.
 * Returns 1 on success, 0 on end-of-file or read error.
 */
static int read_input_line(char buf[], int size) {
    if (fgets(buf, size, stdin) == NULL) {
        return 0;
    }
    if (strchr(buf, '\n') == NULL) {
        /* No newline stored: the line was truncated (or input ended). */
        int ch;
        while ((ch = getchar()) != '\n' && ch != EOF) {
        }
    }
    remove_newline(buf);
    return 1;
}

/*
 * Prints prompt, reads a query word into word and lower-cases it.
 * Returns 1 on success, 0 on end-of-file or read error.
 */
static int read_query_word(const char *prompt, char word[], int size) {
    printf("%s", prompt);
    if (!read_input_line(word, size)) {
        return 0;
    }
    for (int i = 0; word[i]; i++) {
        /* <ctype.h> functions require a value representable as unsigned char. */
        word[i] = (char)tolower((unsigned char)word[i]);
    }
    return 1;
}

/*
 * Interactive command loop.
 *
 * state 0: no documents set          -> only set / help / exit are accepted
 * state 1: documents set             -> only preprocess (plus set / help / exit)
 * state 2: documents preprocessed    -> tf / idf / tfidf / stat are accepted
 *
 * The loop ends on the 'exit' command or when stdin reaches end-of-file.
 */
int main(void) {
    char command[50];
    char word[MAX_TOKEN_LEN];
    int state = 0;

    printf("Type 'help' to see available commands.\n");
    while (1) {
        printf("\nEnter command: ");
        if (!read_input_line(command, (int)sizeof(command))) {
            /* Without this check a closed stdin would spin forever. */
            printf("\nExiting the program.\n");
            break;
        }

        if (strcmp(command, "set") == 0) {
            set_documents();
            /* set_documents() leaves num_docs at 0 when input was rejected. */
            state = (num_docs > 0) ? 1 : 0;
            continue;
        }
        if (strcmp(command, "help") == 0) {
            help();
            continue;
        }
        if (strcmp(command, "exit") == 0) {
            printf("Exiting the program.\n");
            break;
        }

        if (state == 0) {
            printf("Please enter 'set' to input documents or 'help' to see commands.\n");
            continue;
        }

        if (state == 1) {
            if (strcmp(command, "preprocess") == 0) {
                normalize_case_all();
                tokenize_all();
                remove_stop_words_all();
                stem_all_tokens();
                state = 2;
            } else {
                printf("Please enter 'preprocess' after 'set'.\n");
            }
            continue;
        }

        /* state == 2: num_docs > 0 is guaranteed because 'set' resets state. */
        if (strcmp(command, "tf") == 0) {
            if (!read_query_word("Enter word to compute TF: ", word, (int)sizeof(word))) {
                break;
            }
            printf("TF:\n");
            for (int i = 0; i < num_docs; i++) {
                printf("Document %d: %.4lf\n", i + 1, compute_tf(word, i));
            }
        } else if (strcmp(command, "idf") == 0) {
            if (!read_query_word("Enter word to compute IDF: ", word, (int)sizeof(word))) {
                break;
            }
            /* %s, not %c: word is a string. */
            printf("IDF for %s: %.4lf\n", word, compute_idf(word));
        } else if (strcmp(command, "tfidf") == 0) {
            if (!read_query_word("Enter word to compute TF-IDF: ", word, (int)sizeof(word))) {
                break;
            }
            printf("TF-IDF:\n");
            compute_tfidf_all(word);
        } else if (strcmp(command, "stat") == 0) {
            display_stat();
        } else {
            printf("Unknown command.\n");
        }
    }
    return 0;
}
/*
 * Truncates str at its first newline character.
 * A string that contains no newline is left unchanged.
 */
void remove_newline(char str[]) {
    char *newline = strchr(str, '\n');
    if (newline != NULL) {
        *newline = '\0';
    }
}
/*
 * Tells whether word is one of the configured stop words.
 * The comparison is exact (case-sensitive) against each entry of stop_words[].
 * Returns 1 on a match, 0 otherwise.
 */
int is_stop_word(char word[]) {
    int idx = 0;
    while (idx < NUM_STOP_WORDS) {
        if (!strcmp(stop_words[idx], word)) {
            return 1;
        }
        idx++;
    }
    return 0;
}
/*
 * Prompts for a document count (1..MAX_DOCS) and then for that many non-empty
 * documents, storing them in documents[] and the count in num_docs.
 *
 * On any failure (unparsable or out-of-range count, end-of-file while
 * reading) num_docs is left at 0 so callers can tell nothing was stored.
 * A document that is empty or does not fit in MAX_LEN - 1 characters is
 * rejected and the same document number is asked for again.
 */
void set_documents() {
    char line[32];
    char *end = NULL;
    long requested;
    int ch;
    int stored = 0;

    /* Invalidate the previous corpus until the new one is read completely. */
    num_docs = 0;

    printf("Enter number of documents (1-%d): ", MAX_DOCS);
    if (fgets(line, sizeof(line), stdin) == NULL) {
        printf("Error reading number of documents.\n");
        return;
    }
    if (strchr(line, '\n') == NULL) {
        /* Overlong line: drop the rest so it is not read as a document. */
        while ((ch = getchar()) != '\n' && ch != EOF) {
        }
    }

    /* strtol reports "no digits" via end == line; out-of-range values come
     * back as LONG_MIN/LONG_MAX and fail the range check below. */
    requested = strtol(line, &end, 10);
    if (end != line) {
        while (isspace((unsigned char)*end)) {
            end++;
        }
    }
    if (end == line || *end != '\0' || requested <= 0 || requested > MAX_DOCS) {
        printf("Invalid number of documents. Must be from 1 to %d\n", MAX_DOCS);
        return;
    }

    while (stored < requested) {
        char *doc = documents[stored];

        printf("Enter Document %d: ", stored + 1);
        if (fgets(doc, MAX_LEN, stdin) == NULL) {
            /* End of input: retrying would loop forever, so give up. */
            printf("Error reading document.\n");
            return;
        }
        if (strchr(doc, '\n') == NULL && strlen(doc) == (size_t)(MAX_LEN - 1)) {
            /* Buffer filled without reaching the end of the line. */
            printf("Document too long\n");
            while ((ch = getchar()) != '\n' && ch != EOF) {
            }
            continue;
        }
        remove_newline(doc);
        if (doc[0] == '\0') {
            printf("Document %d is empty.\n", stored + 1);
            continue;
        }
        stored++;
    }

    num_docs = stored;
    printf("Documents set successfully.\n");
    printf("Please enter 'preprocess' command to continue. It will not take other commands.\n");
}
/*
 * Converts every stored document to lower case in place, then prints the
 * normalized documents followed by a completion message.
 */
void normalize_case_all() {
    for (int i = 0; i < num_docs; i++) {
        for (int j = 0; documents[i][j]; j++) {
            /* Cast first: passing a negative char to tolower() is undefined. */
            documents[i][j] = (char)tolower((unsigned char)documents[i][j]);
        }
    }
    printf("\nNormalized Documents:\n");
    for (int i = 0; i < num_docs; i++) {
        printf("Document %d: %s\n", i + 1, documents[i]);
    }
    printf("Documents normalized.\n");
}
/*
 * Rebuilds the global token list from all documents and prints it.
 *
 * A token is a run of alphanumeric characters. It ends at a space, one of
 * , . : ; ? ! or at the end of the document. Any other character is skipped
 * without ending the token. Tokens longer than MAX_TOKEN_LEN - 1 characters
 * are truncated. Once MAX_TOKENS tokens are stored, tokenization stops and
 * "Token limit reached" is printed once.
 */
void tokenize_all() {
    int limit_reached = 0;

    token_count = 0;
    for (int i = 0; i < num_docs && !limit_reached; i++) {
        const char *doc = documents[i];
        char word[MAX_TOKEN_LEN];
        int k = 0;

        for (int j = 0; ; j++) {
            unsigned char c = (unsigned char)doc[j];

            if (isalnum(c)) {
                /* Keep one byte free for the terminator. */
                if (k < MAX_TOKEN_LEN - 1) {
                    word[k++] = (char)c;
                }
            } else if (c == '\0' || c == ' ' || c == ',' || c == '.' ||
                       c == ':' || c == ';' || c == '?' || c == '!') {
                /* '\0' counts as a delimiter so the last word is not lost. */
                if (k > 0) {
                    word[k] = '\0';
                    if (token_count >= MAX_TOKENS) {
                        printf("Token limit reached");
                        limit_reached = 1;
                        break;
                    }
                    strcpy(tokens[token_count++], word);
                    k = 0;
                }
            }
            if (c == '\0') {
                break;
            }
        }
    }

    printf("\nTokens:\n");
    for (int i = 0; i < token_count; i++) {
        printf("%d. %s\n", i + 1, tokens[i]);
    }
    printf("Tokenization complete. Total tokens: %d\n", token_count);
}
void remove_stop_words_all() {
char new_tokens[MAX_TOKENS][MAX_TOKEN_LEN];
int new_count = 0;
for (int i = 0; i < token_count; i++) {
if (!is_stop_word(tokens[i])) {
strcpy(new_tokens[new_count++], tokens[i]);
}
}
token_count = new_count;
for (int i = 0; i < token_count; i++) {
strcpy(tokens[i], new_tokens[i]);
}
printf("\nTokens after stop-word removal:\n");
for (int i = 0; i < token_count; i++) {
printf("%d. %s\n", i + 1, tokens[i]);
}
printf("Stop-word removal complete. Tokens remaining: %d\n", token_count);
}
/*
 * Applies a simple suffix-stripping stemmer to every token in place and
 * prints the result. Per token, the first matching rule wins:
 *   - longer than 3 characters and ends in "ing" -> drop 3 characters
 *   - longer than 2 characters and ends in "ed"  -> drop 2 characters
 *   - longer than 1 character and ends in 's'    -> drop 1 character
 */
void stem_all_tokens() {
    for (int idx = 0; idx < token_count; idx++) {
        char *tok = tokens[idx];
        int n = strlen(tok);
        int strip = 0;

        if (n > 3 && strcmp(tok + n - 3, "ing") == 0) {
            strip = 3;
        } else if (n > 2 && strcmp(tok + n - 2, "ed") == 0) {
            strip = 2;
        } else if (n > 1 && tok[n - 1] == 's') {
            strip = 1;
        }

        if (strip > 0) {
            tok[n - strip] = '\0';
        }
    }

    printf("\nStemmed Tokens:\n");
    int shown = 0;
    while (shown < token_count) {
        printf("%d. %s\n", shown + 1, tokens[shown]);
        shown++;
    }
    printf("Stemming complete. Total stemmed tokens: %d\n", token_count);
}
/*
 * Term frequency of `word` in document `doc_id` (0-based).
 *
 * The document is re-scanned with the same rules as the preprocessing
 * pipeline: lower-case, split on space and , . : ; ? ! (and end of text),
 * drop stop words, strip "ing" / "ed" / trailing 's'. `word` is compared
 * against the stemmed tokens as given, so it must already be in stemmed,
 * lower-case form.
 *
 * Returns matches / non-stop-word tokens, or 0.0 when doc_id is out of range
 * or the document has no countable tokens.
 */
double compute_tf(char word[], int doc_id) {
    if (doc_id < 0 || doc_id >= num_docs) {
        return 0.0;
    }

    const char *doc = documents[doc_id];
    char current[MAX_TOKEN_LEN];
    int k = 0;
    int count_word = 0;
    int total_words = 0;

    for (int i = 0; ; i++) {
        /* Cast before tolower(): negative char values are undefined there. */
        int c = tolower((unsigned char)doc[i]);

        if (isalnum(c)) {
            /* Truncate overlong tokens instead of overflowing the buffer. */
            if (k < MAX_TOKEN_LEN - 1) {
                current[k++] = (char)c;
            }
        } else if (c == '\0' || c == ' ' || c == ',' || c == '.' ||
                   c == ':' || c == ';' || c == '?' || c == '!') {
            /* '\0' counts as a delimiter so the last word is counted too. */
            if (k > 0) {
                current[k] = '\0';
                if (!is_stop_word(current)) {
                    int l = k;
                    if (l > 3 && strcmp(current + l - 3, "ing") == 0) {
                        current[l - 3] = '\0';
                    } else if (l > 2 && strcmp(current + l - 2, "ed") == 0) {
                        current[l - 2] = '\0';
                    } else if (l > 1 && current[l - 1] == 's') {
                        current[l - 1] = '\0';
                    }
                    if (strcmp(current, word) == 0) {
                        count_word++;
                    }
                    total_words++;
                }
                k = 0;
            }
        }
        if (c == '\0') {
            break;
        }
    }

    return total_words == 0 ? 0.0 : (double)count_word / total_words;
}
/*
 * Inverse document frequency of `word` over the current corpus:
 *
 *     log10(num_docs / (1 + number of documents containing word))
 *
 * Documents are re-scanned with the same rules as the preprocessing
 * pipeline (lower-case, split on space and , . : ; ? ! and end of text,
 * drop stop words, strip "ing" / "ed" / trailing 's'); `word` must already
 * be in stemmed, lower-case form. The +1 smoothing means the result is
 * negative when every document contains the word.
 *
 * Returns 0.0 when no documents are set.
 */
double compute_idf(char word[]) {
    if (num_docs == 0) {
        return 0.0;
    }

    int docs_with_word = 0;

    for (int doc_id = 0; doc_id < num_docs; doc_id++) {
        const char *doc = documents[doc_id];
        char current[MAX_TOKEN_LEN];
        int k = 0;
        int found = 0;

        for (int i = 0; !found; i++) {
            /* Cast before tolower(): negative char values are undefined there. */
            int c = tolower((unsigned char)doc[i]);

            if (isalnum(c)) {
                /* Truncate overlong tokens instead of overflowing the buffer. */
                if (k < MAX_TOKEN_LEN - 1) {
                    current[k++] = (char)c;
                }
            } else if (c == '\0' || c == ' ' || c == ',' || c == '.' ||
                       c == ':' || c == ';' || c == '?' || c == '!') {
                /* '\0' counts as a delimiter so the last word is checked too. */
                if (k > 0) {
                    current[k] = '\0';
                    if (!is_stop_word(current)) {
                        int l = k;
                        if (l > 3 && strcmp(current + l - 3, "ing") == 0) {
                            current[l - 3] = '\0';
                        } else if (l > 2 && strcmp(current + l - 2, "ed") == 0) {
                            current[l - 2] = '\0';
                        } else if (l > 1 && current[l - 1] == 's') {
                            current[l - 1] = '\0';
                        }
                        if (strcmp(current, word) == 0) {
                            found = 1;
                        }
                    }
                    k = 0;
                }
            }
            if (c == '\0') {
                break;
            }
        }

        if (found) {
            docs_with_word++;
        }
    }

    /* N is the number of documents actually loaded, not the MAX_DOCS capacity. */
    return log10((double)num_docs / (1 + docs_with_word));
}
/*
 * Prints the TF-IDF score of `word` for each document, one line per
 * document. The IDF is computed once and multiplied by each document's TF.
 */
void compute_tfidf_all(char word[]) {
    const double inverse_doc_freq = compute_idf(word);
    int doc = 0;

    while (doc < num_docs) {
        double term_freq = compute_tf(word, doc);
        double score = term_freq * inverse_doc_freq;
        printf("Document %d: %.4lf\n", doc + 1, score);
        doc++;
    }
}
/* Prints the blank label cell followed by one "docN" column heading per document. */
static void print_doc_columns(void) {
    printf("%-12s", "");
    for (int col = 0; col < num_docs; col++) {
        printf("\tdoc%d", col + 1);
    }
    printf("\n");
}

/*
 * Prints three tables (TF, IDF, TF-IDF) covering every distinct token.
 *
 * The token list is rebuilt first by re-running tokenization, stop-word
 * removal and stemming (each of which prints its own output). The distinct
 * tokens are then listed in ascending strcmp() order.
 */
void display_stat() {
    if (num_docs == 0) {
        printf("No documents set. Use 'set' command first.\n");
        return;
    }

    tokenize_all();
    remove_stop_words_all();
    stem_all_tokens();

    /* Collect each token the first time it appears. */
    char vocab[MAX_TOKENS][MAX_TOKEN_LEN];
    int vocab_size = 0;
    for (int t = 0; t < token_count; t++) {
        int seen = 0;
        for (int v = 0; v < vocab_size && !seen; v++) {
            seen = (strcmp(tokens[t], vocab[v]) == 0);
        }
        if (seen) {
            continue;
        }
        strcpy(vocab[vocab_size], tokens[t]);
        vocab_size++;
    }

    /* Exchange sort into ascending order. */
    for (int lo = 0; lo + 1 < vocab_size; lo++) {
        for (int hi = lo + 1; hi < vocab_size; hi++) {
            if (strcmp(vocab[lo], vocab[hi]) <= 0) {
                continue;
            }
            char swap[MAX_TOKEN_LEN];
            strcpy(swap, vocab[lo]);
            strcpy(vocab[lo], vocab[hi]);
            strcpy(vocab[hi], swap);
        }
    }

    printf("\n============== TF ==============\n");
    print_doc_columns();
    for (int row = 0; row < vocab_size; row++) {
        printf("%-12s", vocab[row]);
        for (int col = 0; col < num_docs; col++) {
            double tf = compute_tf(vocab[row], col);
            printf("\t%.4lf", tf);
        }
        printf("\n");
    }

    printf("\n=============== IDF ===============\n");
    for (int row = 0; row < vocab_size; row++) {
        double idf = compute_idf(vocab[row]);
        printf("%-12s %.4lf\n", vocab[row], idf);
    }

    printf("\n============= TF-IDF ==============\n");
    print_doc_columns();
    for (int row = 0; row < vocab_size; row++) {
        printf("%-12s", vocab[row]);
        double idf = compute_idf(vocab[row]);
        for (int col = 0; col < num_docs; col++) {
            printf("\t%.4lf", compute_tf(vocab[row], col) * idf);
        }
        printf("\n");
    }
}
/*
 * Prints the list of supported commands, one per line, with the command
 * name left-aligned in a 10-character column followed by its description.
 */
void help() {
    static const char *const commands[][2] = {
        {"set", "Input documents"},
        {"preprocess", "Normalize, tokenize, remove stop-words, stem"},
        {"tf", "Compute Term Frequency for a word"},
        {"idf", "Compute Inverse Document Frequency for a word"},
        {"tfidf", "Compute TF-IDF for a word"},
        {"stat", "Display TF, IDF, TF-IDF for all tokens"},
        {"help", "Show commands"},
        {"exit", "Exit the program"},
    };
    const size_t command_total = sizeof(commands) / sizeof(commands[0]);

    printf("\nAvailable commands:\n");
    for (size_t c = 0; c < command_total; c++) {
        printf("%-10s: %s\n", commands[c][0], commands[c][1]);
    }
}
// Comments
// Post a Comment