Discussion on Similarity Scores Challenge

6 years ago + 0 comments

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Documents to compare. They are numbered from 1 in the challenge, so
# corpus[0] is "document 1" -- the one every other document is scored against.
corpus = [
    "I'd like an apple.",
    "An apple a day keeps the doctor away.",
    "Never compare an apple to an orange.",
    "I prefer scikit-learn to orange.",
]

# One TF-IDF row per document, returned as a sparse matrix.
vectorizer = TfidfVectorizer()
sparse = vectorizer.fit_transform(corpus)

# Pairwise dot-product similarities: x[i, j] scores document i against j.
# TfidfVectorizer L2-normalises rows by default, so these dot products are
# cosine similarities. `@` is used rather than `*` because `*` only means
# matrix multiplication for the legacy scipy.sparse matrix classes; on the
# newer sparse-array classes it is element-wise.
x = sparse @ sparse.T

# Row 0 scores the first document against all documents. Column 0 (the
# document against itself) is sliced off, so argmax index 0 corresponds to
# corpus[1]; adding 2 converts that back to a 1-indexed document number.
# The original author reports the answer 3 (similarity of roughly 0.39).
print(np.argmax(x[0, 1:].toarray()) + 2)

Cookie support is required to access HackerRank