Source code for find_similar.core

"""
Core module with search functions
"""

from .calc_functions import (
    TokenText,
    calc_cosine_similarity_opt,
    calc_keywords_rating,
    sort_search_list,
)
from .tokenize import tokenize


# TODO: pylint said too many arguments here. And it's true. We should think about this problem
[docs] def find_similar( # pylint: disable=too-many-arguments text_to_check, texts, language="russian", count=5, dictionary=None, remove_stopwords=True, keywords=None, ) -> list[TokenText]: """ The main function to search similar texts. :param text_to_check: Text to find similars :param texts: List of str or TokenText. In these texts we will search similars :param language: Language, default='russian' :param count: Results count :param dictionary: default = None. If you want to replace one words to others :param keywords: default = None. :param remove_stopwords: default = True. Remove or not stopwords :return: Result list sorted by similarity percent (cos) """ if isinstance(text_to_check, TokenText): text_to_check_tokens = text_to_check.tokens else: text_to_check_tokens = tokenize( text_to_check, language, dictionary, remove_stopwords ) token_texts = [] for text in texts: if not isinstance(text, TokenText): text = TokenText(text, dictionary=dictionary, language=language, remove_stopwords=remove_stopwords) cos = calc_cosine_similarity_opt(text.tokens, text_to_check_tokens) text.cos = cos if keywords: keywords_rating = calc_keywords_rating(text, keywords) text.key = keywords_rating token_texts.append(text) text_rated_sorted = sort_search_list(token_texts, keywords=keywords) return text_rated_sorted[:count]