Detect HTML Attributes

  • + 0 comments
    import re
    import sys
    
    # Tokenizer for comments and start tags.
    #
    # * Comments are matched (and discarded) as a unit so that markup inside
    #   them is never reported and "<!--" is not mistaken for a tag name.
    # * A tag name must begin with an ASCII letter, which excludes closing
    #   tags ("</a>"), "<!DOCTYPE ...>" and "<?xml ...?>".
    # * The attribute text is scanned quote-aware: a quoted value may contain
    #   "<", ">" and line breaks without ending the tag early.  The three
    #   alternatives start with disjoint characters, so the repetition cannot
    #   backtrack exponentially.
    tag_pattern = re.compile(
        r'''
        <!--.*?-->                        # comment: consumed whole, yields no tag
        |
        <([A-Za-z][^\s/<>]*)              # group 1: tag name
        ((?:[^<>"']|"[^"]*"|'[^']*')*)    # group 2: raw attribute text
        >
        ''',
        re.DOTALL | re.VERBOSE,
    )

    # One attribute with a quoted value.  The whole value is consumed so that
    # text inside it (e.g. href="u?x='1'") is never mistaken for an attribute.
    # Group 1 is the complete attribute name (hyphens, digits, colons and
    # upper-case letters included).
    att_pattern = re.compile(r'''([^\s"'<>/=]+)\s*=\s*(?:"[^"]*"|'[^']*')''')


    def collect_attributes(content):
        """Map every start tag found in *content* to its set of attribute names.

        Only attributes whose value is quoted (single or double quotes) are
        recorded; unquoted and value-less attributes are ignored.  Tag and
        attribute names are kept exactly as written in the input.  A tag that
        appears without any quoted attribute maps to an empty set.

        Args:
            content: HTML text to scan.

        Returns:
            A dict of ``tag name -> set of attribute names``.
        """
        tags = {}
        for tag_match in tag_pattern.finditer(content):
            name, att_text = tag_match.groups()
            if name is None:
                # The comment alternative matched: nothing to record.
                continue
            tags.setdefault(name, set()).update(att_pattern.findall(att_text))
        return tags


    def format_lines(tags):
        """Return one ``tag:att1,att2`` line per tag, tags and attributes sorted."""
        return [f'{tag}:{",".join(sorted(tags[tag]))}' for tag in sorted(tags)]


    def main():
        """Read HTML from standard input and print the tag/attribute report."""
        for line in format_lines(collect_attributes(sys.stdin.read())):
            print(line)


    if __name__ == '__main__':
        main()