Structuring the Document

Sort by

recency

|

124 Discussions

|

  • + 0 comments

    Here’s a concise C solution that parses a document into paragraphs, sentences, and words, and answers queries that retrieve specific parts of the document. For more help solving problems like this, visit our site.

    #include <stdio.h>

    #include <stdlib.h>

    #include <string.h>

    #define MAX_PARAGRAPHS 100

    #define MAX_SENTENCES 100

    #define MAX_WORDS 100

    // Structure definitions
    struct word { char* data; };

    struct sentence { struct word* data; int word_count; };

    struct paragraph { struct sentence* data; int sentence_count; };

    struct document { struct paragraph* data; int paragraph_count; };

    // The single document instance that the functions below fill in and query.
    struct document Doc;

    // Initialize document: reserve n zero-initialized paragraph slots in Doc.
    // A non-positive n or a failed allocation leaves Doc empty (paragraph_count == 0).
    void initialize_document(int n) {
        Doc.data = NULL;
        Doc.paragraph_count = 0;
        if (n <= 0) {
            return;
        }

        Doc.data = calloc((size_t)n, sizeof *Doc.data);
        if (Doc.data != NULL) {
            Doc.paragraph_count = n;
        }
    }

    // Count how many times ch occurs in the NUL-terminated string s.
    static size_t doc_count_char(const char* s, char ch) {
        size_t hits = 0;
        for (; *s != '\0'; s++) {
            if (*s == ch) {
                hits++;
            }
        }
        return hits;
    }

    // Return the next non-empty token of *cursor delimited by any character in
    // delims, NUL-terminating it in place and advancing *cursor past it.
    // Returns NULL when no token is left. Unlike strtok, all state lives in
    // *cursor, so tokenizers for different nesting levels do not interfere.
    static char* doc_next_token(char** cursor, const char* delims) {
        char* start = *cursor;
        if (start == NULL) {
            return NULL;
        }

        start += strspn(start, delims);
        if (*start == '\0') {
            *cursor = NULL;
            return NULL;
        }

        char* end = start + strcspn(start, delims);
        if (*end == '\0') {
            *cursor = NULL;
        } else {
            *end = '\0';
            *cursor = end + 1;
        }
        return start;
    }

    // Parse document text into Doc.
    //
    // text is split in place: paragraphs at '\n', sentences at '.', words at ' '.
    // Every word is copied into its own heap buffer. At most Doc.paragraph_count
    // paragraphs are stored (call initialize_document first); extra lines are
    // ignored. Sentences that contain no word are skipped. Parsing stops early
    // if an allocation fails, keeping whatever was stored so far.
    void parse_document(char* text) {
        char* para_cursor = text;
        char* para_text;
        int para_index = 0;

        // The bound is tested first so that no token is consumed without a slot.
        while (para_index < Doc.paragraph_count &&
               (para_text = doc_next_token(&para_cursor, "\n")) != NULL) {
            struct paragraph* para = &Doc.data[para_index++];
            para->sentence_count = 0;
            // A paragraph with k periods holds at most k + 1 sentences.
            para->data = malloc((doc_count_char(para_text, '.') + 1) * sizeof *para->data);
            if (para->data == NULL) {
                return;
            }

            char* sent_cursor = para_text;
            char* sent_text;
            while ((sent_text = doc_next_token(&sent_cursor, ".")) != NULL) {
                struct sentence* sen = &para->data[para->sentence_count];
                sen->word_count = 0;
                // A sentence with k spaces holds at most k + 1 words.
                sen->data = malloc((doc_count_char(sent_text, ' ') + 1) * sizeof *sen->data);
                if (sen->data == NULL) {
                    return;
                }

                char* word_cursor = sent_text;
                char* word_text;
                while ((word_text = doc_next_token(&word_cursor, " ")) != NULL) {
                    char* copy = malloc(strlen(word_text) + 1);
                    if (copy == NULL) {
                        // Keep the words already stored reachable before giving up.
                        if (sen->word_count > 0) {
                            para->sentence_count++;
                        }
                        return;
                    }
                    strcpy(copy, word_text);
                    sen->data[sen->word_count++].data = copy;
                }

                if (sen->word_count == 0) {
                    // Only spaces between two periods: not a sentence.
                    free(sen->data);
                    sen->data = NULL;
                    continue;
                }
                para->sentence_count++;
            }
        }
    }

    // Query Functions void get_paragraph(int p) { for (int i = 0; i < Doc.data[p - 1].sentence_count; i++) { for (int j = 0; j < Doc.data[p - 1].data[i].word_count; j++) { printf("%s ", Doc.data[p - 1].data[i].data[j].data); } printf(". "); } printf("\n"); }

    // Print sentence s of paragraph p (both 1-based): each word followed by a
    // space, then a newline. The indices are not range-checked.
    void get_sentence(int p, int s) {
        const int para_idx = p - 1;
        const int sent_idx = s - 1;
        int w = 0;

        while (w < Doc.data[para_idx].data[sent_idx].word_count) {
            printf("%s ", Doc.data[para_idx].data[sent_idx].data[w].data);
            ++w;
        }
        printf("\n");
    }

    // Print word w of sentence s of paragraph p (all 1-based) followed by a
    // newline. The indices are not range-checked.
    void get_word(int p, int s, int w) {
        const int para_idx = p - 1;
        const int sent_idx = s - 1;
        const int word_idx = w - 1;
        const char* text = Doc.data[para_idx].data[sent_idx].data[word_idx].data;

        printf("%s\n", text);
    }

    // Main function to process input.
    //
    // Input format: the paragraph count n, then n lines of text, then the
    // query count q, then q queries of the form "1 p", "2 p s" or "3 p s w".
    // Returns 1 when n or q cannot be read (or n < 1), 0 otherwise.
    int main() {
        enum { TEXT_CAPACITY = 1000 };

        int n;
        if (scanf("%d\n", &n) != 1 || n < 1) {
            return 1;
        }

        // Join the lines (fgets keeps each '\n') into one buffer. Text that does
        // not fit is truncated instead of being written past the end of the buffer.
        char document_text[TEXT_CAPACITY] = "";
        size_t used = 0;
        for (int i = 0; i < n; i++) {
            char temp[TEXT_CAPACITY];
            if (fgets(temp, sizeof temp, stdin) == NULL) {
                break;
            }

            size_t room = sizeof document_text - 1 - used;
            size_t len = strlen(temp);
            if (len > room) {
                len = room;
            }
            memcpy(document_text + used, temp, len);
            used += len;
            document_text[used] = '\0';
        }

        initialize_document(n);
        parse_document(document_text);

        int q;
        if (scanf("%d\n", &q) != 1) {
            return 1;
        }
        for (int i = 0; i < q; i++) {
            int type, p, s, w;
            // Stop at the first malformed query so no uninitialized index is used.
            if (scanf("%d", &type) != 1) {
                break;
            }

            if (type == 1) {
                if (scanf("%d", &p) != 1) {
                    break;
                }
                get_paragraph(p);
            } else if (type == 2) {
                if (scanf("%d %d", &p, &s) != 2) {
                    break;
                }
                get_sentence(p, s);
            } else if (type == 3) {
                if (scanf("%d %d %d", &p, &s, &w) != 3) {
                    break;
                }
                get_word(p, s, w);
            }
        }

        return 0;
    }

  • + 0 comments
    // Split text in place into a document: '\n' ends a paragraph, '.' ends a
    // sentence, ' ' separates words.
    //
    // Words are not copied: each word's data points into text, and every
    // delimiter in text is overwritten with '\0'. Paragraphs and sentences are
    // created only when their first word is stored, so repeated delimiters
    // (double spaces, a space after a period, blank lines) never produce empty
    // entries or words with a NULL data pointer. A word still pending at '\n'
    // or at the end of text is stored as well. If an allocation fails, the
    // document built so far is returned.
    struct document get_document(char* text)
    {
        struct document doc;
        struct paragraph *cur_paragraph = NULL;
        struct sentence *cur_sentence = NULL;
        char *new_word = NULL;

        doc.data = NULL;
        doc.paragraph_count = 0;

        for (char *s = text; ; ++s)
        {
            const char c = *s;

            if (c != ' ' && c != '.' && c != '\n' && c != '\0')
            {
                // first character of a word: remember where it starts
                if (new_word == NULL)
                    new_word = s;
                continue;
            }

            if (new_word != NULL)
            {
                // new paragraph
                if (cur_paragraph == NULL)
                {
                    struct paragraph *grown = realloc(doc.data, sizeof(struct paragraph) * (doc.paragraph_count + 1));
                    if (grown == NULL)
                        return doc;
                    doc.data = grown;

                    cur_paragraph = doc.data + doc.paragraph_count;
                    cur_paragraph->data = NULL;
                    cur_paragraph->sentence_count = 0;
                    doc.paragraph_count++;

                    cur_sentence = NULL;        // a new paragraph always starts a new sentence
                }

                // new sentence
                if (cur_sentence == NULL)
                {
                    struct sentence *grown = realloc(cur_paragraph->data, sizeof(struct sentence) * (cur_paragraph->sentence_count + 1));
                    if (grown == NULL)
                        return doc;
                    cur_paragraph->data = grown;

                    cur_sentence = cur_paragraph->data + cur_paragraph->sentence_count;
                    cur_sentence->data = NULL;
                    cur_sentence->word_count = 0;
                    cur_paragraph->sentence_count++;
                }

                // new word
                struct word *grown = realloc(cur_sentence->data, sizeof(struct word) * (cur_sentence->word_count + 1));
                if (grown == NULL)
                    return doc;
                cur_sentence->data = grown;
                cur_sentence->data[cur_sentence->word_count].data = new_word;
                cur_sentence->word_count++;
                new_word = NULL;
            }

            if (c == '\0')
                break;

            if (c == '.')
            {
                cur_sentence = NULL;            // the next word starts a new sentence
            }
            else if (c == '\n')
            {
                cur_sentence = NULL;
                cur_paragraph = NULL;           // the next word starts a new paragraph
            }

            *s = 0;                             // terminate the word that ended here
        }

        return doc;
    }
    
    // Return (by value) the k-th word of the m-th sentence of the n-th
    // paragraph of Doc. All three positions are 1-based and are not range-checked.
    struct word kth_word_in_mth_sentence_of_nth_paragraph(struct document Doc, int k, int m, int n)
    {
        const int para_idx = n - 1;
        const int sent_idx = m - 1;
        const int word_idx = k - 1;

        return Doc.data[para_idx].data[sent_idx].data[word_idx];
    }
    
    // Return (by value) the k-th sentence of the m-th paragraph of Doc.
    // Both positions are 1-based and are not range-checked.
    struct sentence kth_sentence_in_mth_paragraph(struct document Doc, int k, int m)
    {
        const int para_idx = m - 1;
        const int sent_idx = k - 1;

        return Doc.data[para_idx].data[sent_idx];
    }
    
    // Return (by value) the k-th paragraph of Doc. k is 1-based and is not
    // range-checked.
    struct paragraph kth_paragraph(struct document Doc, int k)
    {
        const int para_idx = k - 1;

        return Doc.data[para_idx];
    }
    
  • + 0 comments
    // Shorthand for fields of a document. Every macro expands to an expression
    // on a variable named `d`, so they can only be used where such a variable
    // is in scope.
    // PARAGRAPH_COUNT is d.paragraph_count itself; SENTENCE_COUNT and WORD_COUNT
    // reuse the counts as indices, i.e. they name the counters of the paragraph
    // at index paragraph_count and of the sentence at index sentence_count.
    #define PARAGRAPH_COUNT     d.paragraph_count
    #define SENTENCE_COUNT      d.data[PARAGRAPH_COUNT].sentence_count
    #define WORD_COUNT          d.data[PARAGRAPH_COUNT].data[SENTENCE_COUNT].word_count
    
    // Data pointers of the same entries: the sentence array of that paragraph,
    // the word array of that sentence, and the string of the word at index
    // word_count.
    #define PARAGRAPH_DATA      d.data[PARAGRAPH_COUNT].data
    #define SENTENCE_DATA       PARAGRAPH_DATA[SENTENCE_COUNT].data
    #define WORD_DATA           SENTENCE_DATA[WORD_COUNT].data
    
    struct document get_document(char* text) {
    
        struct document d = {0};
        d.data = calloc(MAX_PARAGRAPHS , sizeof(struct paragraph));
        d.data[0].data = calloc(((MAX_CHARACTERS/4)/2) , sizeof(struct sentence));
        d.data[0].data[0].data = calloc((MAX_CHARACTERS/4) , sizeof(struct word));
        
        char * ptr = text;
        int s_index = 0 , current_index = 0;
        
        while (*ptr != '\0') 
        {
            if( *ptr == ' ')
            {
                *ptr = '\0';
                if(strlen(text+s_index) > 0)
                {
                    WORD_DATA = calloc(strlen(text+s_index) , sizeof(char));
                    WORD_DATA = (text+s_index);
                    WORD_COUNT++;
                }
                s_index = current_index+1;
            }  
    
            else if (*ptr == '.') {
                *ptr = '\0';
                if(strlen(text+s_index) > 0)
                {
                    WORD_DATA = calloc(strlen(text+s_index) , sizeof(char));
                    WORD_DATA = (text+s_index);
                    WORD_COUNT++;
                    
                    SENTENCE_COUNT++;
                    SENTENCE_DATA = calloc(((MAX_CHARACTERS/4)/2) , sizeof(struct word));                
                }
                s_index = current_index+1;    
            }
            else if (*ptr == '\n') {
                *ptr = '\0';
    
                PARAGRAPH_COUNT++;
                PARAGRAPH_DATA = calloc(MAX_PARAGRAPHS , sizeof(struct sentence));
                SENTENCE_DATA = calloc(((MAX_CHARACTERS/4)/2) , sizeof(struct word));
                s_index = current_index+1;            
            }
            
            ptr++;
            current_index++;
        }
    
        PARAGRAPH_COUNT++;
        return d;
    }
    
    // Look up word k of sentence m of paragraph n (all 1-based, unchecked) and
    // return it by value.
    struct word kth_word_in_mth_sentence_of_nth_paragraph(struct document Doc, int k, int m, int n) {
        return *((Doc.data + (n - 1))->data[m - 1].data + (k - 1));
    }
    
    // Look up sentence k of paragraph m (both 1-based, unchecked) and return it
    // by value.
    struct sentence kth_sentence_in_mth_paragraph(struct document Doc, int k, int m) {
        return *((Doc.data + (m - 1))->data + (k - 1));
    }
    
    // Look up paragraph k (1-based, unchecked) and return it by value.
    struct paragraph kth_paragraph(struct document Doc, int k) {
        return *(Doc.data + (k - 1));
    }
    
  • + 0 comments

    .c

    // Number of elements requested by the parsing functions' initial malloc calls.
    #define DEFAULT_LEN 8
    
    /// @brief Append one character to a heap-allocated character buffer, growing it by one byte
    /// @param word address of the buffer pointer; updated to the reallocated buffer
    /// @param word_len address of the current length; incremented by one
    /// @param ch the character stored in the new last position
    void insert_char(char** word, int* word_len, char ch) {
        int new_len = *word_len + 1;
        char* grown = realloc(*word, sizeof(char) * new_len);

        *word_len = new_len;
        *word = grown;
        grown[new_len - 1] = ch;
    }
    
    /// @brief Tell whether a character counts as whitespace (only the space character does)
    /// @param ch the character to classify
    /// @return 1 if ch is ' ', 0 otherwise
    int is_whitespace(char ch) {
        if (ch != ' ') {
            return 0;
        }
        return 1;
    }
    
    /// @brief Advance an index past any run of whitespace characters
    /// @param text the text being scanned
    /// @param character address of the current index; left on the first non-whitespace character
    void trim_whitespace(char* text, int* character) {
        int position = *character;

        for (; is_whitespace(text[position]); position++) {
            /* skip */
        }
        *character = position;
    }
    
    /// @brief Tell whether a character is an ASCII letter (a-z or A-Z)
    /// @param ch the character to classify
    /// @return 1 for a letter, 0 otherwise
    int is_text(char ch) {
        if (ch >= 'a' && ch <= 'z') {
            return 1;
        }
        if (ch >= 'A' && ch <= 'Z') {
            return 1;
        }
        return 0;
    }
    
    /// @brief Tell whether a character ends a sentence (only the period does)
    /// @param ch the character to classify
    /// @return 1 if ch is '.', 0 otherwise
    int is_sentence_terminator(char ch) {
        return ch == '.' ? 1 : 0;
    }
    
    /// @brief Tell whether a character ends a paragraph (newline or the string's null terminator)
    /// @param ch the character to classify
    /// @return 1 if ch is '\n' or '\0', 0 otherwise
    int is_paragraph_terminator(char ch) {
        switch (ch) {
        case '\n':
        case '\0':
            return 1;
        default:
            return 0;
        }
    }
    
    /// @brief Read the character at the current index, then advance the index by one
    /// @param text the text being scanned
    /// @param character address of the current index; incremented by one
    /// @return the character that was at the index before it was advanced
    char next_character(char* text, int* character) {
        int position = *character;

        *character = position + 1;
        return text[position];
    }
    
    /// @brief Read the run of letters that starts at the current index into a new word
    /// @param text the text being scanned
    /// @param character address of the current index; left on the first non-letter character
    /// @return a word whose data is a freshly allocated, null-terminated copy of the letters
    ///         (an empty string when the index is not on a letter)
    struct word next_word(char* text, int* character) {
        struct word W;
        int length = 0;                                     // Characters stored so far

        W.data = malloc(DEFAULT_LEN * sizeof(char));        // Starting buffer, resized by insert_char

        // Copy letters one at a time until something else shows up
        for (;;) {
            if (!is_text(text[*character])) {
                break;
            }
            insert_char(&W.data, &length, next_character(text, character));
        }

        insert_char(&W.data, &length, '\0');                // Terminate the copied string

        return W;
    }
    
    /// @brief Function to parse the next sentence from the text
    ///
    /// Words are collected until a period is reached; the period is consumed.
    /// Parsing also stops at a paragraph terminator (newline or end of text),
    /// which is left in place for the caller, so an unterminated sentence
    /// cannot make the loop run forever. Characters that are neither letters,
    /// whitespace nor terminators are skipped, and no empty words are stored.
    /// @param text pointer to char
    /// @param character pointer to int, advanced past everything that was consumed
    /// @return sentence struct - the sentence (word_count is 0 if no word was found)
    struct sentence next_sentence(char* text, int* character) {
        struct sentence S;
        S.data = malloc(DEFAULT_LEN * sizeof(struct word));                     // Allocate initial memory for the sentence
        S.word_count = 0;                                                       // Initialize sentence length

        for (;;) {
            trim_whitespace(text, character);                                   // Trim leading whitespace
            char ch = text[*character];

            if (is_sentence_terminator(ch)) {
                next_character(text, character);                                // Move past the period
                break;
            }
            if (is_paragraph_terminator(ch)) {
                break;                                                          // Unterminated sentence: stop here
            }
            if (!is_text(ch)) {
                next_character(text, character);                                // Skip a stray character so the index always advances
                continue;
            }

            struct word W = next_word(text, character);                         // Parse the next word

            S.word_count++;
            S.data = realloc(S.data, S.word_count * sizeof(struct word));       // Resize the sentence array
            S.data[S.word_count - 1] = W;                                       // Add the word to the sentence
        }

        return S;
    }
    
    /// @brief Read sentences from the current index until a paragraph terminator is reached
    /// @param text the text being scanned
    /// @param character address of the current index; left on the paragraph terminator
    /// @return a paragraph holding the parsed sentences in order
    struct paragraph next_paragraph(char* text, int* character) {
        struct paragraph P;

        P.sentence_count = 0;                                                   // Nothing parsed yet
        P.data = malloc(DEFAULT_LEN * sizeof(struct sentence));                 // Starting array, resized below

        for (;;) {
            if (is_paragraph_terminator(text[*character])) {
                break;                                                          // Newline or end of text
            }

            struct sentence S = next_sentence(text, character);
            int slot = P.sentence_count;                                        // Index the new sentence will occupy

            P.sentence_count = slot + 1;
            P.data = realloc(P.data, P.sentence_count * sizeof(struct sentence));
            P.data[slot] = S;
        }

        return P;
    }
    
    /// @brief Parse the whole text into a document, one paragraph at a time
    /// @param text the text to parse
    /// @return a document holding the parsed paragraphs in order (always at least one)
    struct document get_document(char* text) {
        struct document D;
        int character = 0;                                                          // Index of the next unread character
        char consumed;                                                              // Terminator read after each paragraph

        D.paragraph_count = 0;
        D.data = malloc(DEFAULT_LEN * sizeof(struct paragraph));                    // Starting array, resized below

        // At least one paragraph is always parsed; stop once the terminator was the end of the text
        do {
            struct paragraph P = next_paragraph(text, &character);
            consumed = next_character(text, &character);                            // Step over '\n' or '\0'

            D.paragraph_count += 1;
            D.data = realloc(D.data, D.paragraph_count * sizeof(struct paragraph));
            D.data[D.paragraph_count - 1] = P;
        } while (consumed != '\0');

        return D;
    }
    
    /// @brief Fetch one word by its position in the document
    /// @param Doc the document to read from
    /// @param k 1-based position of the word within its sentence
    /// @param m 1-based position of the sentence within its paragraph
    /// @param n 1-based position of the paragraph within the document
    /// @return a copy of the word struct (no range checking is done)
    struct word kth_word_in_mth_sentence_of_nth_paragraph(struct document Doc, int k, int m, int n) {
        const int paragraph_index = n - 1;
        const int sentence_index = m - 1;
        const int word_index = k - 1;

        return Doc.data[paragraph_index].data[sentence_index].data[word_index];
    }
    
    /// @brief Fetch one sentence by its position in the document
    /// @param Doc the document to read from
    /// @param k 1-based position of the sentence within its paragraph
    /// @param m 1-based position of the paragraph within the document
    /// @return a copy of the sentence struct (no range checking is done)
    struct sentence kth_sentence_in_mth_paragraph(struct document Doc, int k, int m) { 
        const int paragraph_index = m - 1;
        const int sentence_index = k - 1;

        return Doc.data[paragraph_index].data[sentence_index];
    }
    
    /// @brief Fetch one paragraph by its position in the document
    /// @param Doc the document to read from
    /// @param k 1-based position of the paragraph within the document
    /// @return a copy of the paragraph struct (no range checking is done)
    struct paragraph kth_paragraph(struct document Doc, int k) {
        const int paragraph_index = k - 1;

        return Doc.data[paragraph_index];
    }
    
  • + 0 comments

    Not the prettiest code I've ever written.

    // Append ch to the string held in tmp_word and bump *letter_count.
    //
    // The buffer is always grown to hold the letters plus one extra byte, and
    // that byte is set to '\0'. The string is therefore terminated after every
    // call, and a caller that writes the terminator itself at
    // data[*letter_count] stays inside the allocation. If the buffer cannot be
    // grown, the word and the count are left unchanged.
    void append_word(struct word *tmp_word, int *letter_count, char ch){
        size_t used = (size_t)*letter_count;
        char *grown = realloc(tmp_word->data, sizeof(char)*(used+2));
        if (grown == NULL) {
            return;
        }
        grown[used] = ch;
        grown[used+1] = '\0';
        tmp_word->data = grown;
        *letter_count += 1;
    }
    
    // Start a new, empty word: point tmp_word at a fresh one-char allocation
    // and set the letter counter to zero. The previous buffer is not freed.
    void reset_word(struct word *tmp_word, int *letter_count){
        char *fresh = malloc(sizeof(char));

        *letter_count = 0;
        tmp_word->data = fresh;
    }
    
    // Grow the sentence's word array by one slot and copy *tmp_word into it.
    void append_sentence(struct sentence *tmp_sentence, struct word *tmp_word){
        int slot = tmp_sentence->word_count;

        tmp_sentence->data = realloc(tmp_sentence->data, sizeof(struct word)*(slot+1));
        tmp_sentence->data[slot] = *tmp_word;
        tmp_sentence->word_count = slot + 1;
    }
    
    // Start a new, empty sentence: point tmp_sentence at a fresh one-word
    // allocation and set its word count to zero. The previous array is not freed.
    void reset_sentence(struct sentence *tmp_sentence){
        tmp_sentence->word_count = 0;
        tmp_sentence->data = malloc(1 * sizeof(struct word));
    }
    
    // Grow the paragraph's sentence array by one slot and copy *tmp_sentence into it.
    void append_paragraph(struct paragraph *tmp_paragraph, struct sentence *tmp_sentence){
        int slot = tmp_paragraph->sentence_count;

        tmp_paragraph->data = realloc(tmp_paragraph->data, sizeof(struct sentence)*(slot+1));
        tmp_paragraph->data[slot] = *tmp_sentence;
        tmp_paragraph->sentence_count = slot + 1;
    }
    
    // Start a new, empty paragraph: point tmp_paragraph at a fresh one-sentence
    // allocation and set its sentence count to zero. The previous array is not freed.
    void reset_paragraph(struct paragraph *tmp_paragraph){
        tmp_paragraph->sentence_count = 0;
        tmp_paragraph->data = malloc(1 * sizeof(struct sentence));
    }
    
    // Grow the document's paragraph array by one slot and copy *tmp_paragraph into it.
    void append_document(struct document *tmp_document, struct paragraph *tmp_paragraph){
        int slot = tmp_document->paragraph_count;

        tmp_document->data = realloc(tmp_document->data, sizeof(struct paragraph)*(slot+1));
        tmp_document->data[slot] = *tmp_paragraph;
        tmp_document->paragraph_count = slot + 1;
    }
    
    struct document get_document(char* text) {
        //get length of text
        int text_length = strlen(text);
        
        struct word *tmp_word;
        int letters;
        tmp_word = malloc(sizeof(struct word));
        reset_word(tmp_word, &letters);
        
        struct sentence *tmp_sentence;
        tmp_sentence = malloc(sizeof(struct sentence));
        reset_sentence(tmp_sentence);
        
        struct paragraph *tmp_paragraph;
        tmp_paragraph = malloc(sizeof(struct paragraph));
        reset_paragraph(tmp_paragraph);
        
        struct document *tmp_document;
        tmp_document = malloc(sizeof(struct document));
        tmp_document->paragraph_count = 0;
        tmp_document->data = malloc(sizeof(struct sentence)); //paragraph[0]
        
        
        //iterate over the text
        for(int i=0; i<text_length; i++){ //condense by i<strlen(text)
            char ch = text[i];
            if((ch >= 'a' && ch <= 'z') || 
                (ch >= 'A' && ch <= 'Z')){
                //found letter
                
                //put into word
                append_word(tmp_word, &letters, ch);
            } else {
                //found end of word   
                if(letters>0){     
                    //terminate word string
                    tmp_word->data[letters] = '\0'; 
                    //put word into current sentence     
                    append_sentence(tmp_sentence, tmp_word);           
                    //clean tmp word
                    reset_word(tmp_word, &letters);
                }
    
                //check for end of sentence or paragraph
                if (ch == ' '){ //is word
                
                } else if (ch == '\n'){ 
                    //found end of paragraph
                    //put into document
                    append_document(tmp_document, tmp_paragraph);
                    reset_paragraph(tmp_paragraph);
                }
                else { 
                    //found end of sentence
                    //put into current paragraph
                    append_paragraph(tmp_paragraph, tmp_sentence);
                    reset_sentence(tmp_sentence);
                }
            }
        }
        //end of document
        append_document(tmp_document, tmp_paragraph);
        
        return *tmp_document;
    }
    
    // Return a copy of paragraph number k (counting from 1) of Doc; k is not
    // validated.
    struct paragraph kth_paragraph(struct document Doc, int k) {
        const int zero_based = k - 1;

        return Doc.data[zero_based];
    }
    
    // Return a copy of sentence number k of paragraph number m of Doc (both
    // counting from 1); neither index is validated.
    struct sentence kth_sentence_in_mth_paragraph(struct document Doc, int k, int m) { 
        const int para = m - 1;
        const int sent = k - 1;

        return Doc.data[para].data[sent];
    }
    
    // Return a copy of word number k of sentence number m of paragraph number n
    // of Doc (all counting from 1); no index is validated.
    struct word kth_word_in_mth_sentence_of_nth_paragraph(struct document Doc, int k, int m, int n) {
        const int para = n - 1;
        const int sent = m - 1;
        const int word = k - 1;

        return Doc.data[para].data[sent].data[word];
    }