NLP Text Similarity in Python
Sathishkumar Nagarajan
Sathishkumar Nagarajan
Enterprise Architect @ TrueTech. | CISM, Cloud Architecture, Conversational AI, Enterprise Architect, Security Architecture, DevOps and Cloud Services
# Compute the pairwise cosine similarity between two short documents using a
# bag-of-words (term count) representation.
import sys

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the documents to compare.
doc_trump = "How do I make a Directory entry?"
doc_election = "I make a Directory submit"

documents = [doc_trump, doc_election]

# Create the Document Term Matrix.
# The default vectorizer (no stop-word filtering) is used on purpose: the
# original script built a CountVectorizer(stop_words='english') and then
# immediately overwrote it, so only the default configuration ever took effect.
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert the sparse matrix to a pandas DataFrame to inspect the
# per-document word frequencies.
# toarray() yields a plain ndarray (todense() returns the discouraged
# numpy.matrix type); get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
doc_term_matrix = sparse_matrix.toarray()
df = pd.DataFrame(
    doc_term_matrix,
    columns=count_vectorizer.get_feature_names_out(),
    index=['doc_trump', 'doc_election'],
)
df  # no-op in a script; displays the table when run as the last line of a notebook cell

# Compute Cosine Similarity between every pair of documents (rows of df).
print(cosine_similarity(df, df))
Output:
[[1. 0.51639778]
[0.51639778 1. ]]