• + 0 comments
    # Find which of the other documents is most similar to the first one,
    # using TF-IDF vectors and pairwise dot products.
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = [
        "I'd like an apple.",
        "An apple a day keeps the doctor away.",
        "Never compare an apple to an orange.",
        "I prefer scikit-learn to orange.",
    ]

    vectorizer = TfidfVectorizer()
    # One row per document, one column per vocabulary term (sparse).
    sparse = vectorizer.fit_transform(corpus)

    # Pairwise dot-product similarities between all documents; the sentences
    # are short, so the full document-by-document product is cheap.
    # `@` is explicit matrix multiplication: `*` only means that for the legacy
    # scipy.sparse *matrix* classes and is element-wise for sparse *arrays*.
    # NOTE(review): with TfidfVectorizer's default L2 row normalisation these
    # dot products should equal cosine similarities -- confirm against the docs.
    x = sparse @ sparse.T

    # Row 0 holds the first document's similarity to every document. Column 0
    # (its similarity to itself) is skipped, so argmax is relative to
    # document 2: add 1 for the skipped column and 1 more because documents
    # are 1-indexed. Answer: 3 (0.39ish).
    print(np.argmax(x[0, 1:].toarray()) + 2)