Source code for find_similar.calc_functions

"""
Calculation functions to find similarity percent
"""
from .tokenize import tokenize


[docs] def calc_cosine_similarity_opt(x_set: set, y_set: set) -> float: """ Get cos between two sets of words :param x_set: One set :param y_set: Another set :return: cos similarity """ intersection_length = len(y_set & x_set) cosine = 0 if intersection_length > 0: l1n_sum = len(x_set) l2n_sum = len(y_set) sum_l1_l2 = l1n_sum * l2n_sum cosine = intersection_length / float(sum_l1_l2**0.5) return cosine
[docs] class TokenText: """ The main type to work with text tokens """ def __init__(self, # pylint: disable=too-many-arguments text, tokens=None, dictionary=None, language='russian', remove_stopwords=True, **kwargs): """ init method :param text: simple text :param tokens: You can set already created tokens. Default = None :param dictionary: default = None. If you want to replace one words to others you can send the dictionary. :param remove_stopwords: default = True. :param language: default = russian. :param **kwargs: You can set any properties in the result object :return: cos similarity """ self.text = text self.key = 0 for k, val in kwargs.items(): setattr(self, k, val) self.tokens = tokens if tokens else get_tokens(text, dictionary=dictionary, language=language, remove_stopwords=remove_stopwords) def __eq__(self, other): """ compare two TokenText objects :param other: second TokenText objects (self - is the first) :return: True or False, depends on object id """ return self.text == other.text def __str__(self): return repr(self) def __repr__(self): cos_str = f', cos={self.cos}' if hasattr(self, 'cos') else '' return f'TokenText(text="{self.text}", len(tokens)={len(self.tokens)}{cos_str})'
[docs] def get_tokens(text, dictionary=None, language="russian", remove_stopwords=True) -> set: """ Get tokens from str text :param text: str text :param dictionary: default = None. If you want to replace one words to others you can send the dictionary :param language :param remove_stopwords :return: tokes for text """ tokens = tokenize(text, language, dictionary, remove_stopwords) return tokens
[docs] def calc_keywords_rating(text, keywords): """ Calc keywords rating :param keywords: Keywords """ rating = 0 for token in text.tokens: for k, v in keywords.items(): if str(k) == str(token): rating = rating + v return rating
[docs] def sort_search_list(token_texts, keywords=None): """ Sort search list :param token_texts: Texts with tokens :param keywords: Keywords, default None """ text_sorted_by_cos = sorted( token_texts, key=lambda item: item.cos, reverse=True ) if keywords: text_sorted_by_cos = sorted( token_texts, key=lambda item: (item.key, item.cos), reverse=True ) return text_sorted_by_cos