Discussion on Build a Stack Exchange Scraper Challenge

1 year ago+ 0 comments
Java 15
import java.io.*;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
/**
 * Reads HTML markup from standard input and prints one line per matched
 * question in the form {@code id;title;relative-time}.
 */
public class Solution {
    // Group 1: digits following "question-summary-".
    // Group 2: text right after class="question-hyperlink"> up to the next '<'.
    // Group 3: text right after class="relativetime"> up to the next </span>.
    // The reluctant ".*?" gaps keep each match inside the nearest following markers.
    // Note from the original author: anchoring on href="/questions/(digits)/ instead
    // of the question-summary id was also reported to work.
    private static final Pattern QUESTION = Pattern.compile(
            "question-summary-(\\d+).*?class=\"question-hyperlink\">(.*?)<.*?class=\"relativetime\">(.*?)</span>");

    /**
     * Consumes all of standard input, joins the lines without separators, and
     * writes every question found to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Lines are concatenated WITHOUT newline characters on purpose: '.' in the
        // pattern does not match line terminators, so the markup must be one line.
        // StringBuilder avoids re-copying the whole buffer on every appended line.
        StringBuilder markup = new StringBuilder();
        try (Scanner scanner = new Scanner(System.in)) {
            while (scanner.hasNextLine()) {
                markup.append(scanner.nextLine());
            }
        }

        Matcher matcher = QUESTION.matcher(markup);
        while (matcher.find()) {
            System.out.println(String.join(";", matcher.group(1), matcher.group(2), matcher.group(3)));
        }
    }
}

/*
(.*?) matches any character (.) any number of times (*), as few times as possible while still letting the overall regex match (?). Used on its own it matches any string but captures only an empty string: the asterisk allows zero characters, and the question mark makes the quantifier lazy, so it consumes nothing and leaves the rest of the input for whatever follows in the pattern. The construct becomes useful inside a larger regex: when (.*?) is followed by a literal such as < or </span>, it expands just far enough to reach the first occurrence of that literal, which is how the pattern above captures the title and the relative time.

(.*)? makes the whole group optional: it is matched zero or one time (?). The group itself is a greedy run of any length (*) of any character (.). This will also match anything, but it captures the entire first line, because by default (without DOTALL) the dot matches any character except a line terminator.

\" represents double quotes
*/
Cookie support is required to access HackerRank