offline 3

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>


/* Capacity limits for the fixed-size, statically allocated corpus. */
#define MAX_DOCS 50
/* Size in bytes of each document buffer (including the terminating NUL). */
#define MAX_LEN 5000
/* Maximum number of tokens stored across all documents. */
#define MAX_TOKENS 500
/* Size in bytes of each token buffer (including the terminating NUL). */
#define MAX_TOKEN_LEN 50
/* Number of entries in stop_words. */
#define NUM_STOP_WORDS 8

/* Raw document text; lower-cased in place by normalize_case_all(). */
char documents[MAX_DOCS][MAX_LEN]; 
/* Token table rebuilt by tokenize_all(), then filtered and stemmed in place. */
char tokens[MAX_TOKENS][MAX_TOKEN_LEN]; 
/* Words that is_stop_word() reports as stop words (all lower-case). */
char stop_words[NUM_STOP_WORDS][MAX_TOKEN_LEN] = {
    "the", "is", "a", "an", "and", "in", "of", "to"
};
/* Number of valid rows in documents; 0 means no corpus has been set. */
int num_docs = 0; 
/* Number of valid rows in tokens. */
int token_count = 0; 

// Forward declarations for the functions defined after main().

// Input helpers.
void remove_newline(char str[]);
int is_stop_word(char word[]);
void set_documents();

// Preprocessing pipeline, run in this order by the 'preprocess' command.
void normalize_case_all();
void tokenize_all();
void remove_stop_words_all();
void stem_all_tokens();

// Scoring: doc_id is a 0-based index into documents.
double compute_tf(char word[], int doc_id);
double compute_idf(char word[]);
void compute_tfidf_all(char word[]);

// Reporting.
void display_stat();
void help();

int main() {
    char command[50];
    int state = 0; 

    printf("Type 'help' to see available commands.\n");

    while (1) {
        printf("\nEnter command: ");
        fgets(command, sizeof(command), stdin);
        remove_newline(command);

        if (strcmp(command, "set") == 0) {
            set_documents();
            state = 1;
            continue;
        }
        if (strcmp(command, "help") == 0) {
            help();
            continue;
        }

        if (state == 0) {
            if (strcmp(command, "exit") == 0) {
                printf("Exiting the program.\n");
                break;
            } else {
                printf("Please enter 'set' to input documents or 'help' to see commands.\n");
            }
            continue;
        }

        if (state == 1) {
            if (strcmp(command, "preprocess") == 0) {
                if (num_docs == 0) {
                    printf("No documents set.\n");
                    continue;
                }
                normalize_case_all();
                tokenize_all();
                remove_stop_words_all();
                stem_all_tokens();
                state = 2;
            } else if (strcmp(command, "exit") == 0) {
                printf("Exiting the program.\n");
                break;
            } else {
                printf("Please enter 'preprocess' after 'set'.\n");
            }
            continue;
        }

        if (strcmp(command, "tf") == 0) {
            if (num_docs == 0) {
                printf("No documents set.\n");
                continue;
            }
            char word[MAX_TOKEN_LEN];
            printf("Enter word to compute TF: ");
            fgets(word, MAX_TOKEN_LEN, stdin);
            remove_newline(word);
            for (int i = 0; word[i]; i++) word[i] = tolower(word[i]);

            printf("TF:\n");
            for (int i = 0; i < num_docs; i++) {
                printf("Document %d: %.4lf\n", i + 1, compute_tf(word, i));
            }
        } else if (strcmp(command, "idf") == 0) {
            if (num_docs == 0) {
                printf("No documents set.\n");
                continue;
            }
            char word[MAX_TOKEN_LEN];
            printf("Enter word to compute IDF: ");
            fgets(word, MAX_TOKEN_LEN, stdin);
            remove_newline(word);
            for (int i = 0; word[i]; i++) word[i] = tolower(word[i]);

            printf("IDF for %c: %.4lf\n",word, compute_idf(word));
        } else if (strcmp(command, "tfidf") == 0) {
            if (num_docs == 0) {
                printf("No documents set.\n");
                continue;
            }
            char word[MAX_TOKEN_LEN];
            printf("Enter word to compute TF-IDF: ");
            fgets(word, MAX_TOKEN_LEN, stdin);
            remove_newline(word);
            for (int i = 0; word[i]; i++) word[i] = tolower(word[i]);

            printf("TF-IDF:\n");
            compute_tfidf_all(word);
        } else if (strcmp(command, "stat") == 0) {
            display_stat();
        } else if (strcmp(command, "exit") == 0) {
            break;
        } else {
            printf("Unknown command.\n");
        }
    }

    return 0;
}

/* Cuts str off at its first '\n'; a string without a newline is left as is. */
void remove_newline(char str[]) {
    char *newline = strchr(str, '\n');

    if (newline != NULL) {
        *newline = '\0';
    }
}


/*
 * Returns 1 when word exactly matches an entry of stop_words, otherwise 0.
 * The comparison is case-sensitive.
 */
int is_stop_word(char word[]) {
    int idx = 0;

    while (idx < NUM_STOP_WORDS) {
        if (!strcmp(stop_words[idx], word)) {
            return 1;
        }
        idx++;
    }
    return 0;
}


/*
 * Prompts for a document count and then reads that many non-empty documents
 * from stdin into documents[], updating num_docs.
 *
 * The count is read as a whole line and parsed afterwards, so non-numeric
 * input is rejected cleanly instead of leaving text in stdin for the next
 * prompt. On any failure (bad count, end of input) num_docs is set to 0.
 * Empty or over-long documents are re-prompted.
 */
void set_documents() {
    char line[64];
    int requested = 0;
    int ch;

    printf("Enter number of documents (1-%d): ", MAX_DOCS);

    if (fgets(line, sizeof(line), stdin) == NULL) {
        printf("Error reading number of documents.\n");
        num_docs = 0;
        return;
    }
    if (strchr(line, '\n') == NULL) {
        /* Line did not fit: drop the remainder so it is not read as a document. */
        while ((ch = getchar()) != '\n' && ch != EOF) {
        }
    }

    if (sscanf(line, "%d", &requested) != 1 ||
        requested <= 0 || requested > MAX_DOCS) {
        printf("Invalid number of documents. Must be from 1 to %d\n", MAX_DOCS);
        num_docs = 0;
        return;
    }
    num_docs = requested;

    for (int i = 0; i < num_docs; i++) {
        printf("Enter Document %d: ", i + 1);
        if (fgets(documents[i], MAX_LEN, stdin) == NULL) {
            /* End of input: retrying would loop forever, so give up. */
            printf("Error reading document.\n");
            num_docs = 0;
            return;
        }

        remove_newline(documents[i]);

        /* A full buffer with no newline means the line was cut off by fgets. */
        if (strlen(documents[i]) >= (size_t)(MAX_LEN - 1)) {
            printf("Document too long\n");
            while ((ch = getchar()) != '\n' && ch != EOF) {
            }
            i--;
            continue;
        }

        if (strlen(documents[i]) == 0) {
            printf("Document %d is empty.\n", i + 1);
            i--;
            continue;
        }
    }

    printf("Documents set successfully.\n");
    printf("Please enter 'preprocess' command to continue.  It will not take other commands.\n");
}


/*
 * Lower-cases every stored document in place and prints the result.
 */
void normalize_case_all() {
    for (int i = 0; i < num_docs; i++) {
        for (char *p = documents[i]; *p != '\0'; p++) {
            /*
             * tolower() is only defined for values representable as
             * unsigned char (or EOF); a plain char may be negative for
             * non-ASCII bytes, so convert first.
             */
            *p = (char)tolower((unsigned char)*p);
        }
    }

    printf("\nNormalized Documents:\n");
    for (int i = 0; i < num_docs; i++) {
        printf("Document %d: %s\n", i + 1, documents[i]);
    }
    printf("Documents normalized.\n");
}



void tokenize_all() {
    token_count = 0;

    for (int i = 0; i < num_docs; i++) {
        char temp[MAX_LEN];
        strcpy(temp, documents[i]);

        int len = strlen(temp);
        char word[MAX_TOKEN_LEN] = "";
        int k = 0;

        for (int j = 0; j <= len; j++) {
            if (isalnum(temp[j])) {
                word[k++] = temp[j];
            } else if (temp[j] == ' ' || temp[j] == ',' || temp[j] == '.' ||
                       temp[j] == ':' || temp[j] == ';' || temp[j] == '?' || temp[j] == '!') {
                if (k > 0) {
                    word[k] = '\0';
                    if (token_count < MAX_TOKENS) {
                        strcpy(tokens[token_count++], word);
                    } else {
                        printf("Token limit reached");
                        break;
                    }
                    k = 0;
                }
            }
        }
    }

    printf("\nTokens:\n");
    for (int i = 0; i < token_count; i++) {
        printf("%d. %s\n", i + 1, tokens[i]);
    }
    printf("Tokenization complete. Total tokens: %d\n", token_count);
}

void remove_stop_words_all() {
    char new_tokens[MAX_TOKENS][MAX_TOKEN_LEN];
    int new_count = 0;

    for (int i = 0; i < token_count; i++) {
        if (!is_stop_word(tokens[i])) {
            strcpy(new_tokens[new_count++], tokens[i]);
        }
    }

    token_count = new_count;
    for (int i = 0; i < token_count; i++) {
        strcpy(tokens[i], new_tokens[i]);
    }

    printf("\nTokens after stop-word removal:\n");
    for (int i = 0; i < token_count; i++) {
        printf("%d. %s\n", i + 1, tokens[i]);
    }
    printf("Stop-word removal complete. Tokens remaining: %d\n", token_count);
}

/*
 * Applies a simple suffix-stripping stemmer to every token in place and
 * prints the result. Exactly one rule is applied per token, tried in order:
 * trailing "ing" (length > 3), trailing "ed" (length > 2), trailing "s"
 * (length > 1).
 */
void stem_all_tokens() {
    for (int idx = 0; idx < token_count; idx++) {
        char *tok = tokens[idx];
        int n = strlen(tok);
        int cut;

        if (n > 3 && strcmp(&tok[n - 3], "ing") == 0) {
            cut = 3;
        } else if (n > 2 && strcmp(&tok[n - 2], "ed") == 0) {
            cut = 2;
        } else if (n > 1 && tok[n - 1] == 's') {
            cut = 1;
        } else {
            continue;   /* no suffix rule matched */
        }
        tok[n - cut] = '\0';
    }

    printf("\nStemmed Tokens:\n");
    int shown = 0;
    while (shown < token_count) {
        printf("%d. %s\n", shown + 1, tokens[shown]);
        shown++;
    }
    printf("Stemming complete. Total stemmed tokens: %d\n", token_count);
}


/*
 * Term frequency of word in document doc_id (0-based).
 *
 * The document is scanned word by word: each word is lower-cased, stop words
 * are skipped, and the remaining words are stemmed with the same rules as
 * stem_all_tokens() before being compared with word.
 *
 * Returns (matching words) / (non-stop words in the document), or 0.0 when
 * doc_id is out of range or the document has no non-stop words.
 */
double compute_tf(char word[], int doc_id) {
    if (doc_id < 0 || doc_id >= num_docs) return 0.0;

    const char *text = documents[doc_id];
    char current[MAX_TOKEN_LEN];
    int count_word = 0, total_words = 0;
    int k = 0;

    for (int i = 0; ; i++) {
        /* ctype functions need an unsigned char value, not a plain char. */
        int c = tolower((unsigned char)text[i]);

        if (isalnum(c)) {
            /* Bound the write; characters beyond the buffer are dropped. */
            if (k < MAX_TOKEN_LEN - 1) {
                current[k++] = (char)c;
            }
        } else if (strchr(" ,.:;?!", c) != NULL) {
            /*
             * strchr() also matches c == '\0', so the final word of the
             * document is counted even without trailing punctuation.
             */
            if (k > 0) {
                current[k] = '\0';
                if (!is_stop_word(current)) {
                    int l = k;
                    if (l > 3 && strcmp(current + l - 3, "ing") == 0) current[l - 3] = '\0';
                    else if (l > 2 && strcmp(current + l - 2, "ed") == 0) current[l - 2] = '\0';
                    else if (l > 1 && current[l - 1] == 's') current[l - 1] = '\0';

                    if (strcmp(current, word) == 0) count_word++;
                    total_words++;
                }
                k = 0;
            }
        }

        if (c == '\0') {
            break;
        }
    }

    return total_words == 0 ? 0.0 : (double)count_word / total_words;
}


/*
 * Inverse document frequency of word over the stored corpus.
 *
 * A document contains word when any of its lower-cased, non-stop, stemmed
 * words (same rules as stem_all_tokens()) equals word.
 *
 * Returns log10(num_docs / (1 + docs containing word)), or 0.0 when no
 * documents are stored. The numerator is the actual corpus size, not the
 * MAX_DOCS capacity, so the value reflects the documents really entered.
 */
double compute_idf(char word[]) {
    if (num_docs == 0) return 0.0;

    int docs_with_word = 0;

    for (int doc_id = 0; doc_id < num_docs; doc_id++) {
        const char *text = documents[doc_id];
        char current[MAX_TOKEN_LEN];
        int k = 0;
        int found = 0;

        for (int i = 0; !found; i++) {
            /* ctype functions need an unsigned char value, not a plain char. */
            int c = tolower((unsigned char)text[i]);

            if (isalnum(c)) {
                /* Bound the write; characters beyond the buffer are dropped. */
                if (k < MAX_TOKEN_LEN - 1) {
                    current[k++] = (char)c;
                }
            } else if (strchr(" ,.:;?!", c) != NULL) {
                /*
                 * strchr() also matches c == '\0', so the final word of the
                 * document is examined even without trailing punctuation.
                 */
                if (k > 0) {
                    current[k] = '\0';
                    if (!is_stop_word(current)) {
                        int l = k;
                        if (l > 3 && strcmp(current + l - 3, "ing") == 0) current[l - 3] = '\0';
                        else if (l > 2 && strcmp(current + l - 2, "ed") == 0) current[l - 2] = '\0';
                        else if (l > 1 && current[l - 1] == 's') current[l - 1] = '\0';

                        if (strcmp(current, word) == 0) {
                            found = 1;
                        }
                    }
                    k = 0;
                }
            }

            if (c == '\0') {
                break;
            }
        }

        if (found) docs_with_word++;
    }

    return log10((double)num_docs / (1 + docs_with_word));
}


/*
 * Prints the TF-IDF score of word for every stored document, one line per
 * document. The IDF is computed once and reused for each document.
 */
void compute_tfidf_all(char word[]) {
    const double inverse_doc_freq = compute_idf(word);
    int doc = 0;

    while (doc < num_docs) {
        double term_freq = compute_tf(word, doc);
        printf("Document %d: %.4lf\n", doc + 1, term_freq * inverse_doc_freq);
        doc++;
    }
}


/*
 * Prints TF, IDF and TF-IDF tables for every distinct token in the corpus.
 *
 * The token table is rebuilt first (tokenize, remove stop words, stem), then
 * reduced to its distinct entries, which are listed in strcmp() order. Rows
 * are tokens; columns of the TF and TF-IDF tables are documents.
 */
void display_stat() {
    if (num_docs == 0) {
        printf("No documents set. Use 'set' command first.\n");
        return;
    }

    tokenize_all();
    remove_stop_words_all();
    stem_all_tokens();

    char vocab[MAX_TOKENS][MAX_TOKEN_LEN];
    int vocab_size = 0;

    /* Collect each distinct token once, in order of first appearance. */
    for (int t = 0; t < token_count; t++) {
        int seen = 0;
        for (int v = 0; v < vocab_size && !seen; v++) {
            seen = (strcmp(tokens[t], vocab[v]) == 0);
        }
        if (seen) {
            continue;
        }
        strcpy(vocab[vocab_size], tokens[t]);
        vocab_size++;
    }

    /* Insertion sort into ascending strcmp() order (entries are distinct). */
    for (int i = 1; i < vocab_size; i++) {
        char key[MAX_TOKEN_LEN];
        int pos = i - 1;

        strcpy(key, vocab[i]);
        while (pos >= 0 && strcmp(vocab[pos], key) > 0) {
            strcpy(vocab[pos + 1], vocab[pos]);
            pos--;
        }
        strcpy(vocab[pos + 1], key);
    }

    /* Term-frequency table. */
    printf("\n============== TF ==============\n");
    printf("%-12s", "");
    for (int d = 0; d < num_docs; d++) {
        printf("\tdoc%d", d + 1);
    }
    printf("\n");
    for (int v = 0; v < vocab_size; v++) {
        printf("%-12s", vocab[v]);
        for (int d = 0; d < num_docs; d++) {
            double tf = compute_tf(vocab[v], d);
            printf("\t%.4lf", tf);
        }
        printf("\n");
    }

    /* Inverse-document-frequency list. */
    printf("\n=============== IDF ===============\n");
    for (int v = 0; v < vocab_size; v++) {
        double idf = compute_idf(vocab[v]);
        printf("%-12s %.4lf\n", vocab[v], idf);
    }

    /* TF-IDF table. */
    printf("\n============= TF-IDF ==============\n");
    printf("%-12s", "");
    for (int d = 0; d < num_docs; d++) {
        printf("\tdoc%d", d + 1);
    }
    printf("\n");
    for (int v = 0; v < vocab_size; v++) {
        double idf = compute_idf(vocab[v]);
        printf("%-12s", vocab[v]);
        for (int d = 0; d < num_docs; d++) {
            printf("\t%.4lf", compute_tf(vocab[v], d) * idf);
        }
        printf("\n");
    }
}


/* Prints the list of supported commands with a one-line description each. */
void help() {
    static const char *const entries[][2] = {
        { "set",        "Input documents" },
        { "preprocess", "Normalize, tokenize, remove stop-words, stem" },
        { "tf",         "Compute Term Frequency for a word" },
        { "idf",        "Compute Inverse Document Frequency for a word" },
        { "tfidf",      "Compute TF-IDF for a word" },
        { "stat",       "Display TF, IDF, TF-IDF for all tokens" },
        { "help",       "Show commands" },
        { "exit",       "Exit the program" },
    };
    const int entry_count = (int)(sizeof(entries) / sizeof(entries[0]));

    printf("\nAvailable commands:\n");
    for (int row = 0; row < entry_count; row++) {
        printf("%-10s: %s\n", entries[row][0], entries[row][1]);
    }
}
Search This Blog

Naimul's Academy Trace

offline 3

Comments

Post a Comment

Popular posts from this blog

CSE 101 Archive 1-1

Phy 129 - WM, CT-03