Build a Stack Exchange Scraper

  • + 0 comments

    Java 15

    import java.io.*;
    import java.util.*;
    import java.util.regex.Pattern;
    import java.util.regex.Matcher;
    /**
     * Reads HTML markup from standard input and prints one line per matched question in the
     * form {@code id;title;relative-time}.
     */
    public class Solution {

        /**
         * Matches one question entry. Capture groups:
         * 1 = the digits following {@code question-summary-},
         * 2 = the text following {@code class="question-hyperlink">} up to the next {@code <},
         * 3 = the text following {@code class="relativetime">} up to {@code </span>}.
         * All gaps use reluctant quantifiers so each match stops at the nearest following marker.
         */
        private static final Pattern QUESTION_PATTERN = Pattern.compile(
                "question-summary-(\\d+).*?class=\"question-hyperlink\">(.*?)<.*?class=\"relativetime\">(.*?)</span>");

        /**
         * Program entry point: slurps standard input, extracts the questions and prints them,
         * one per line.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args) {
            // The Scanner wraps System.in, which this program does not own, so it is not closed.
            Scanner scanner = new Scanner(System.in);

            // Lines are joined WITHOUT separators: '.' does not match line terminators by default,
            // so dropping them lets a single match span what were several input lines.
            // StringBuilder keeps the accumulation linear instead of re-copying the text per line.
            StringBuilder markup = new StringBuilder();
            while (scanner.hasNextLine()) {
                markup.append(scanner.nextLine());
            }

            for (String record : extractQuestions(markup.toString())) {
                System.out.println(record);
            }
        }

        /**
         * Finds every question entry in the given single-line markup.
         *
         * @param markup the markup with line terminators already removed
         * @return the matches in document order, each formatted as {@code id;title;time};
         *         empty when nothing matches
         */
        private static List<String> extractQuestions(String markup) {
            List<String> records = new ArrayList<>();
            Matcher matcher = QUESTION_PATTERN.matcher(markup);
            while (matcher.find()) {
                records.add(String.join(";", matcher.group(1), matcher.group(2), matcher.group(3)));
            }
            return records;
        }
    }
    
    /*
    (.*?) matches any character (.) any number of times (*), as few times as possible while still letting the regex match (?). Used entirely on its own, it matches any string but captures only an empty string: the asterisk allows any number of characters in the capturing group, while the lazy question mark makes the matcher leave as much of the input as possible for the rest of the pattern, so nothing is captured. This feature is much more useful inside a more complicated regex such as the one above, where each (.*?) is followed by literal text (for example < or </span>) and therefore expands only as far as needed for the rest of the pattern to match.
    
    (.*)? captures a group zero or one times (?). That group consists of a run of any length (*) of any character (.). This also will match anything, but it will capture the first line, since the dot matches anything except a newline.
    
    \" is an escaped double quote: inside a Java string literal it stands for a literal " character.
    */