Detect HTML Attributes

Sort by

recency

|

105 Discussions

|

  • + 0 comments

    # The exact code will be:
    import re

    try:
        read_line = raw_input  # Python 2, which this snippet was written for
    except NameError:
        read_line = input  # Python 3

    # An opening tag: the tag name, then everything up to the first '>'.
    pattern = re.compile(r'<(\w+)([^>]*)>')

    # One attribute: its name, '=', then the value. The value is consumed
    # together with the name so that an '=' inside it (for example a URL query
    # string in href) is never mistaken for another attribute. The last value
    # alternative may be empty, so a name is still reported when its quoted
    # value was cut short by the tag pattern.
    attr_pattern = re.compile(
        r'''(?:^|\s)([\w:-]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]*)'''
    )


    def detect_attributes(html):
        """Map each tag name found in *html* to the set of its attribute names.

        Closing tags are ignored because the tag pattern needs a word character
        right after '<'. A tag without attributes maps to an empty set.
        """
        tags = {}
        for tag, attrs in pattern.findall(html):
            tags.setdefault(tag, set()).update(attr_pattern.findall(attrs))
        return tags


    def format_report(tags):
        """Return one ``tag:attr1,attr2`` line per tag, tags and attributes sorted.

        A tag without attributes yields ``tag:``.
        """
        return ["%s:%s" % (tag, ",".join(sorted(tags[tag]))) for tag in sorted(tags)]


    def main():
        """Read the line count and the HTML lines from stdin, print the report."""
        # Read input
        line_count = int(read_line())
        # Keep the line breaks so text on adjacent lines cannot fuse into one word.
        html = "\n".join(read_line() for _ in range(line_count))

        # Extract tags and attributes using regex
        tags = detect_attributes(html)

        # Print output
        for line in format_report(tags):
            print(line)


    if __name__ == "__main__":
        main()
    
    
    
    
    
    
    
  • + 8 comments

    Java 15

    A TreeMap maps each tag name to a TreeSet of its attribute names, so tags and attributes both come out sorted and without duplicates.

    Regex for tag : <(\w+).*?>

    • Means "open_tag + tag_name + (anything before finding close_tag) +close_tag".
    • Here tag_name consists of word chars only.

    Regex for attribute :

    • Means : (space)attribute_name="attr_value"
    • We search for the tag attributes within the full_tag only.
    • attr_values can be enclosed within single or double quotes.
    import java.io.*;
    import java.util.*;
    import java.util.regex.Pattern;
    import java.util.regex.Matcher;
    /**
     * Reads a line count followed by that many lines of HTML from standard
     * input and prints, for every tag in alphabetical order, a line of the form
     * {@code tag:attr1,attr2} with the attribute names sorted and de-duplicated.
     * A tag without attributes is printed as {@code tag:}.
     */
    public class Solution {
        /** Opening tag: '<', a word-character tag name, then lazily anything up to the first '>'. */
        private static final Pattern TAG_PATTERN = Pattern.compile("<(\\w+).*?>");

        /**
         * One attribute inside a matched tag: whitespace, the name, '=', then a
         * value whose opening and closing quotes are of the same kind. The name
         * is limited to name characters so that leading blanks or a preceding
         * valueless attribute (e.g. "disabled value") are not captured as part of it.
         */
        private static final Pattern ATTR_PATTERN =
                Pattern.compile("\\s([\\w:.-]+)\\s*=\\s*(?:\"[^\"]*\"|'[^']*')");

        public static void main(String[] args) {
            // TreeMap/TreeSet keep tags and attribute names sorted without an explicit sort.
            TreeMap<String, TreeSet<String>> tagTreemap = new TreeMap<>();
            try (Scanner scanner = new Scanner(System.in)) {
                int n = Integer.parseInt(scanner.nextLine().trim());
                // Stop early instead of throwing when fewer lines arrive than announced.
                for (int i = 0; i < n && scanner.hasNextLine(); i++) {
                    Matcher tagMatcher = TAG_PATTERN.matcher(scanner.nextLine());
                    while (tagMatcher.find()) {
                        TreeSet<String> attrTreeset =
                                tagTreemap.computeIfAbsent(tagMatcher.group(1), k -> new TreeSet<>());
                        // Attributes are searched only within the text of this tag.
                        Matcher attrMatcher = ATTR_PATTERN.matcher(tagMatcher.group(0));
                        while (attrMatcher.find()) {
                            attrTreeset.add(attrMatcher.group(1));
                        }
                    }
                }
            }
            for (Map.Entry<String, TreeSet<String>> entry : tagTreemap.entrySet()) {
                System.out.println(entry.getKey() + ":" + String.join(",", entry.getValue()));
            }
        }
    }
    
  • + 0 comments

    Python 3

    import re
    import sys
    # An opening tag: the tag name, then its attribute text up to the closing
    # '>'. Quoted values are skipped as a unit so a '>' inside one does not end
    # the tag, and no assumption is made about how each attribute is written
    # (hyphenated names and unquoted values no longer make the whole tag vanish).
    pattern_tag = re.compile(r'''<(\w+)((?:"[^"]*"|'[^']*'|[^>"'])*)>''')

    # One attribute: whitespace, the name, '=', then the value. Consuming the
    # value keeps text such as ' b=2' inside a quoted value from being reported.
    pattern_attribute = re.compile(
        r'''\s([\w:-]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]*)'''
    )


    def collect_attributes(html: str) -> dict:
        """Return a dict mapping each tag name in *html* to a set of attribute names.

        Closing tags are not matched; a tag without attributes maps to an empty set.
        """
        dict_tag = {}
        for tag, attribute_text in pattern_tag.findall(html):
            dict_tag.setdefault(tag, set()).update(
                pattern_attribute.findall(attribute_text)
            )
        return dict_tag


    def main() -> None:
        """Read the HTML from stdin and print one ``tag:attr1,attr2`` line per tag."""
        input()  # skip the line-count header; the rest is read in one go
        dict_tag = collect_attributes(sys.stdin.read())
        for tag in sorted(dict_tag):
            print(f"{tag}:{','.join(sorted(dict_tag[tag]))}")


    if __name__ == "__main__":
        main()
    
  • + 0 comments
    import re
    import sys
    
    # An opening tag: '<', a name that starts with a letter (so '<!DOCTYPE ...>'
    # and '<!-- ... -->' are not reported as tags), then everything up to the
    # first '>'. '[^>]*' also crosses line breaks, so a tag may span lines.
    tag_pattern = re.compile(r'<([A-Za-z][^/\s>]*)([^>]*)>')

    # One attribute: the name in any letter case and with hyphens, '=', then the
    # value. The value is consumed so nothing inside it is scanned as a name.
    att_pattern = re.compile(
        r'''(?:^|\s)([^\s"'=<>/]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]*)'''
    )


    def scan_tags(content: str) -> dict:
        """Return a dict mapping each tag name in *content* to a set of attribute names.

        Closing tags are skipped; a tag without attributes maps to an empty set.
        """
        tags = {}
        for name, attribute_text in tag_pattern.findall(content):
            tags.setdefault(name, set()).update(att_pattern.findall(attribute_text))
        return tags


    def main() -> None:
        """Read all of stdin and print one ``tag:attr1,attr2`` line per tag, sorted."""
        tags = scan_tags(sys.stdin.read())
        for tag in sorted(tags):
            atts = ','.join(sorted(tags[tag]))
            print(f'{tag}:{atts}')


    if __name__ == "__main__":
        main()
    
  • + 0 comments

    Testcase fails.

    Copy testcase STDIN to custom input case box.

    TEST SUCCESSFUL.

    WHAT GIVES!!!

    Not the prettiest code, but it definitely worked:

    import re
    
    # One token per match: an opening tag name, an attribute (its name plus its
    # whole value, so nothing inside the value is scanned again), or the '>'
    # that ends the current tag.
    RE = re.compile(
        r'''<(?P<tag>\w+)'''
        r'''|(?P<attr>[\w:-]+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'>]*)'''
        r'''|(?P<end>>)'''
    )


    def parse_lines(lines):
        """Return a dict mapping each tag name in *lines* to a set of attribute names.

        *lines* is any iterable of strings. Attributes are credited to the most
        recently opened tag until its closing '>' is seen, so ``name=value``
        text outside a tag is ignored and a tag may continue on the next line.
        Attributes of repeated tags accumulate instead of being reset.
        """
        tag_dict = {}
        key = None  # name of the tag currently open, if any
        for line in lines:
            for token in RE.finditer(line):
                kind = token.lastgroup
                if kind == 'tag':
                    key = token.group('tag')
                    # Keep attributes gathered from earlier occurrences of this tag.
                    tag_dict.setdefault(key, set())
                elif kind == 'end':
                    key = None
                elif key is not None:
                    # Only the name is stored; the '=' and the value are dropped.
                    tag_dict[key].add(token.group('attr'))
        return tag_dict


    def main():
        """Read the line count and the HTML lines from stdin, print the report."""
        n = int(input())
        tag_dict = parse_lines(input() for _ in range(n))
        for key in sorted(tag_dict):
            print(f'{key}:{",".join(sorted(tag_dict[key]))}')


    if __name__ == "__main__":
        main()