Detect HTML Tags

Sort by

recency

|

153 Discussions

|

  • + 0 comments

    C

    /*
     * qsort() comparator for an array of `char *` elements.
     * Each argument points at one array slot; the strings those slots refer to
     * are ordered with strcmp().
     */
    int cmp_str(const void *a, const void *b)
    {
        const char *lhs = *(char *const *)a;
        const char *rhs = *(char *const *)b;

        return strcmp(lhs, rhs);
    }

    /*
     * Reads an HTML document from stdin and prints the distinct tag names found
     * in it, sorted with cmp_str() and joined by ';', followed by a newline.
     *
     * Both "<name" and "</name" forms are recognised; names are compared
     * case-sensitively. At most sizeof(buffer) - 1 bytes of input are examined.
     *
     * Returns 0 on success, 1 if the regex cannot be compiled or the tag
     * storage is exhausted.
     */
    int main(void) {
        /* Upper bound on the number of distinct tag names kept. */
        enum { MAX_TAGS = 256 };

        /*
         * '<', optional blanks, optional '/', then the tag name (group 1).
         * POSIX ERE notes: a character class must be written inside a bracket
         * expression ("[[:blank:]]"), and there are no lazy quantifiers.
         */
        const char *pattern = "<[[:blank:]]*/?([A-Za-z0-9]+)";
        char buffer[MAX_INPUT];
        regex_t regex;
        char error_msg[128];

        // --- Get input from stdin ---
        size_t len = fread(buffer, 1, sizeof(buffer) - 1, stdin);
        buffer[len] = '\0';  // null-terminate so regexec() can scan it

        // --- Compile regex ---
        int result = regcomp(&regex, pattern, REG_EXTENDED);
        if (result != 0) {
            regerror(result, &regex, error_msg, sizeof(error_msg));
            printf("Regex compile error: %s\n", error_msg);
            return 1;
        }

        // --- Collect distinct tag names ---
        char names[MAX_OUTPUT];   // unique names stored back to back, each NUL-terminated
        char *tags[MAX_TAGS];     // pointers into names[], one per unique tag
        size_t used = 0;          // bytes of names[] already occupied
        size_t tag_count = 0;
        regmatch_t match[2];      // [0] = whole match, [1] = tag name
        const char *ptr = buffer;

        while (regexec(&regex, ptr, 2, match, 0) == 0) {
            const char *name = ptr + match[1].rm_so;
            size_t name_len = (size_t)(match[1].rm_eo - match[1].rm_so);

            // Skip names already recorded so storage only grows per distinct tag.
            int seen = 0;
            for (size_t i = 0; i < tag_count && !seen; i++) {
                seen = strlen(tags[i]) == name_len
                    && memcmp(tags[i], name, name_len) == 0;
            }

            if (!seen) {
                // Need room for the name plus its terminating NUL.
                if (tag_count == MAX_TAGS || name_len >= sizeof(names) - used) {
                    fprintf(stderr, "Too many tags to store\n");
                    regfree(&regex);
                    return 1;
                }
                memcpy(names + used, name, name_len);
                names[used + name_len] = '\0';
                tags[tag_count++] = names + used;
                used += name_len + 1;
            }

            // Resume scanning right after this match; the match is never empty
            // because the pattern requires '<' and at least one name character.
            ptr += match[0].rm_eo;
        }

        // --- Sort and print, separated by ';' ---
        qsort(tags, tag_count, sizeof(tags[0]), cmp_str);

        for (size_t i = 0; i < tag_count; i++) {
            if (i != 0) {
                putchar(';');
            }
            fputs(tags[i], stdout);
        }
        putchar('\n');

        // --- Free compiled regex ---
        regfree(&regex);
        return 0;
    }

  • + 0 comments
    from html.parser import HTMLParser

    # Names of every start tag seen by the parser.
    s=set()

    class myhtml(HTMLParser):
        """HTMLParser subclass that records each start tag's name in ``s``."""

        def handle_starttag(self,tag,attrs):
            s.add(tag)

    # First input line is the number of HTML lines that follow.
    # Lines are joined with "\n" so a tag split across two lines is not fused
    # into a single bogus name (e.g. "<div" + "class=..." -> "<divclass=...").
    html="\n".join(input() for _ in range(int(input())))

    parser=myhtml()
    parser.feed(html)
    # close() forces processing of any data still buffered by the parser.
    parser.close()
    print(";".join(sorted(s)))
    
  • + 0 comments

    For Python 3:

    import re

    # '<', optional whitespace, then a name that is not a closing tag ('/') or a
    # declaration/comment ('!'); the name is captured as group 1.
    opening_tag = re.compile(r'<\s*(?!/|!)([A-Za-z][\w\-]*)',re.ASCII)

    # The first input line gives how many HTML lines follow.
    lines_left = int(input())
    seen_tags = set()
    while lines_left > 0:
        seen_tags |= set(opening_tag.findall(input()))
        lines_left -= 1

    # Distinct names in sorted order, joined by ';'.
    print(";".join(sorted(seen_tags)))
    
  • + 0 comments

    import re

    # First input line: number of HTML lines that follow.
    T = int(input())

    # '<', optional whitespace, then the tag name (captured). This single group
    # matches exactly what the former two-branch pattern matched, without
    # producing empty placeholder strings that then had to be filtered out.
    pattern = r'<\s*([A-Za-z0-9]+)'
    tag_set = set()

    for _ in range(T):
        tag_set.update(re.findall(pattern, input()))

    # Distinct names in sorted order, joined by ';'.
    print(";".join(sorted(tag_set)))

  • + 0 comments

    Python:

    import re

    # First input line is the line count; the remaining lines are the HTML.
    html = "\n".join([input() for _ in range(int(input()))])

    # '<', optional whitespace, tag name (letters followed by letters/digits),
    # then everything up to the closing '>'. The '*' quantifiers are required:
    # without them "<div>" is captured as "di" and "<p>" does not match at all.
    tag_names = set(re.findall(r"<\s*([a-zA-Z]+[0-9A-Za-z]*)[^>]*>", html))

    print(";".join(sorted(tag_names)))