Skip to content

Commit

Permalink
Replace an outdated FlashText library with a custom implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
spyker77 committed Dec 8, 2023
1 parent 6abf830 commit fff3892
Show file tree
Hide file tree
Showing 9 changed files with 256 additions and 32 deletions.
33 changes: 33 additions & 0 deletions keyword_processor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Reworked FlashText Keyword Processor

## Overview

This module presents a reworked and optimized version of the [FlashText](https://pypi.org/project/flashtext/) library for Python. It aims to provide a modern, efficient, and easy-to-maintain implementation for keyword extraction in large texts.

## Features

- **Case Sensitivity**: Supports both case-sensitive and case-insensitive keyword processing.
- **Efficient Extraction**: Offers efficient extraction of keywords from large text datasets.
- **Flexible Keyword Addition**: Allows adding keywords from various sources including files, dictionaries, or directly as strings.
- **Optimized Trie Structure**: Utilizes a custom Trie data structure for optimized search performance.

## Usage

Here is a basic example of how to use the reworked KeywordProcessor:

```python
from keyword_processor import KeywordProcessor

# Initialize the processor with case sensitivity preference
processor = KeywordProcessor(case_sensitive=False)

# Add keywords and their respective clean names
processor.add_keyword("Python", "Python Programming")
processor.add_keyword("Java", "Java Programming")

# Extract keywords from text
text = "I love Python and Java programming languages."
extracted_keywords = processor.extract_keywords(text)

print(extracted_keywords) # Output: ['Python Programming', 'Java Programming']
```
Empty file added keyword_processor/__init__.py
Empty file.
147 changes: 147 additions & 0 deletions keyword_processor/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from collections import defaultdict
from pathlib import Path


class TrieNode:
"""A node in the Trie structure used for efficient keyword searching.
Attributes:
children (dict): A dictionary mapping characters to their corresponding TrieNode.
is_end_of_word (bool): Indicates if the node represents the end of a keyword.
clean_name (str | None): The associated clean name for the keyword represented by this node.
"""

def __init__(self) -> None:
self.children: dict[str, TrieNode] = defaultdict(TrieNode)
self.is_end_of_word: bool = False
self.clean_name: str | None = None


class KeywordProcessor:
"""Processes and extracts keywords from text using a Trie structure.
This class provides methods to add keywords, normalize text, and extract keywords from sentences.
Attributes:
case_sensitive (bool): Determines if keyword matching is case-sensitive.
root (TrieNode): The root node of the Trie structure.
keyword_map (dict): A map from keywords to their clean names.
Args:
case_sensitive (bool): Specifies if the keyword matching should be case-sensitive. Defaults to False.
"""

def __init__(self, case_sensitive: bool = False) -> None:
self.case_sensitive: bool = case_sensitive
self.root: TrieNode = TrieNode()
self.keyword_map: dict[str, str] = {}

def _normalize(self, text: str) -> str:
"""
Normalizes the text based on case sensitivity.
Args:
text (str): The text to normalize.
Returns:
str: Normalized text.
"""
return text if self.case_sensitive else text.lower()

def add_keyword(self, keyword: str, clean_name: str | None = None) -> None:
"""
Adds a keyword to the trie and keyword map.
Args:
keyword (str): The keyword to add.
clean_name (str | None): The clean name associated with the keyword. Defaults to None.
"""
clean_name = clean_name or keyword
keyword = self._normalize(keyword)
self.keyword_map[keyword] = clean_name

node = self.root
for char in keyword:
node = node.children[char]

node.is_end_of_word = True
node.clean_name = clean_name

def add_keyword_from_file(self, keyword_file: str, encoding: str = "utf-8") -> None:
"""
Adds keywords from a file.
Args:
keyword_file (str): Path to the file containing keywords.
encoding (str): The encoding of the file. Defaults to "utf-8".
"""
file_path = Path(keyword_file)

if not file_path.is_file():
raise OSError(f"Invalid file path {keyword_file}")

with file_path.open(encoding=encoding) as file:
for line in file:
parts = line.split("=>")
keyword = parts[0].strip()
clean_name = parts[1].strip() if len(parts) > 1 and parts[1] else None
self.add_keyword(keyword, clean_name)

def add_keywords_from_dict(self, keyword_dict: dict[str, list[str]]) -> None:
"""
Adds multiple keywords from a dictionary.
Args:
keyword_dict (dict[str, list[str]]): Dictionary where each key is a clean name and associated value is a list of keywords.
"""
for clean_name, keywords in keyword_dict.items():
for keyword in keywords:
self.add_keyword(keyword, clean_name)

def _is_word_boundary(self, sentence: str, start_idx: int, end_idx: int) -> bool:
"""
Checks if a given index is a word boundary in the sentence.
Args:
sentence (str): The sentence to check within.
start_idx (int): The starting index of the word.
end_idx (int): The ending index of the word.
Returns:
bool: True if the indices represent a word boundary, False otherwise.
"""
if start_idx > 0 and sentence[start_idx - 1].isalnum():
return False
if end_idx < len(sentence) and sentence[end_idx].isalnum():
return False
return True

def extract_keywords(self, sentence: str, span_info: bool = False) -> list[str | tuple[str, int, int]]:
"""
Extracts keywords from a sentence.
Args:
sentence (str): The sentence to extract keywords from.
span_info (bool): If True, returns a list of tuples with the keyword, start index, and end index. Defaults to False.
Returns:
list[str | tuple[str, int, int]]: A list of extracted keywords or tuples containing the keyword and its span in the sentence.
"""
sentence = self._normalize(sentence)
results = []

for start_pos in range(len(sentence)):
node = self.root
for end_pos in range(start_pos, len(sentence)):
char = sentence[end_pos]
if char in node.children:
node = node.children[char]
if node.is_end_of_word and self._is_word_boundary(sentence, start_pos, end_pos + 1):
clean_name = node.clean_name
match = (clean_name, start_pos, end_pos + 1) if span_info else clean_name
results.append(match)
break
else:
break

return results
55 changes: 55 additions & 0 deletions keyword_processor/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from .processor import KeywordProcessor


class TestKeywordProcessor:
    """Unit tests covering keyword insertion, extraction, and file loading."""

    @pytest.fixture
    def processor(self):
        """Provide a fresh case-insensitive processor for each test."""
        return KeywordProcessor(case_sensitive=False)

    def test_add_keyword(self, processor):
        processor.add_keyword("Python", "Python Programming")
        assert processor.keyword_map.get("python") == "Python Programming"

    def test_case_sensitivity(self):
        sensitive = KeywordProcessor(case_sensitive=True)
        sensitive.add_keyword("Python", "Python Programming")
        assert "Python" in sensitive.keyword_map
        assert "python" not in sensitive.keyword_map

    def test_extract_keywords(self, processor):
        for keyword, clean in (("python", "Python Programming"), ("java", "Java Programming")):
            processor.add_keyword(keyword, clean)
        result = processor.extract_keywords("I love Python and Java.")
        assert result == ["Python Programming", "Java Programming"]

    def test_add_keywords_from_dict(self, processor):
        processor.add_keywords_from_dict({"Programming": ["Python", "Java", "C++"]})
        assert all(k in processor.keyword_map for k in ("python", "java", "c++"))
        assert processor.keyword_map["python"] == "Programming"

    def test_add_keyword_from_file(self, processor, tmpdir):
        # Write a two-line keyword file in the "keyword=>clean name" format.
        keyword_file = tmpdir.join("keywords.txt")
        keyword_file.write("Python=>Python Programming\nJava=>Java Programming")
        processor.add_keyword_from_file(str(keyword_file))
        for key, clean in (("python", "Python Programming"), ("java", "Java Programming")):
            assert processor.keyword_map.get(key) == clean

    def test_add_keyword_from_file_with_invalid_path(self, processor):
        bad_path = "invalid/path/to/file.txt"
        with pytest.raises(OSError) as excinfo:
            processor.add_keyword_from_file(bad_path)
        assert str(excinfo.value) == f"Invalid file path {bad_path}"

    def test_extract_keywords_with_span_info(self, processor):
        processor.add_keyword("python", "Python Programming")
        spans = processor.extract_keywords("I love Python.", span_info=True)
        assert spans == [("Python Programming", 7, 13)]
38 changes: 14 additions & 24 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ django-storages = "^1.14.2"
djangorestframework = "^3.14.0"
djangorestframework-simplejwt = "^5.3.1"
drf-spectacular = "^0.26.5"
flashtext = "^2.7"
gunicorn = "^21.2.0"
pypdf = "^3.17.1"
python = "^3.12"
Expand Down
7 changes: 3 additions & 4 deletions resume_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

from django.core.cache import cache
from django.core.files.uploadedfile import InMemoryUploadedFile
from flashtext import KeywordProcessor
from pypdf import PdfReader

from keyword_processor import KeywordProcessor
from scrapers.models import Skill, Vacancy


Expand All @@ -25,9 +25,8 @@ def find_skills_in_resume(text_from_resume: str) -> set[str]:
cache.set("skills_from_db", skills_from_db, 12 * 60 * 60)

keyword_processor = KeywordProcessor()
for clean_name, unclean_names in skills_from_db:
for unclean_name in unclean_names:
keyword_processor.add_keyword(unclean_name, clean_name)
dict_of_skills = {clean_name: unclean_names for clean_name, unclean_names in skills_from_db}
keyword_processor.add_keywords_from_dict(dict_of_skills)

skills_from_resume = set(keyword_processor.extract_keywords(text_from_resume))
return skills_from_resume
Expand Down
5 changes: 3 additions & 2 deletions scrapers/management/shared/base_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from collections import Counter
from urllib.parse import urlencode

from flashtext import KeywordProcessor
from playwright.sync_api import ElementHandle, Page
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from tenacity import RetryError, Retrying, stop_after_attempt, wait_random_exponential

from keyword_processor import KeywordProcessor

from .utils import get_playwright_page

logger = logging.getLogger("django")
Expand Down Expand Up @@ -178,7 +179,7 @@ def process_vacancy_content(self, vacancy_without_skills: dict[str, str], keywor
Args:
vacancy_without_skills (dict[str, str]): A dictionary containing vacancy details.
keyword_processor (KeywordProcessor): An instance of flashtext.KeywordProcessor.
keyword_processor (KeywordProcessor): An instance of KeywordProcessor.
Returns:
dict: A dictionary with vacancy details and extracted skills.
Expand Down
2 changes: 1 addition & 1 deletion scrapers/management/shared/generic_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from django.core.management.base import BaseCommand
from django.db import OperationalError
from flashtext import KeywordProcessor

from keyword_processor import KeywordProcessor
from scrapers.models import Job, Skill, Vacancy

from .base_scraper import BaseScraper
Expand Down

0 comments on commit fff3892

Please sign in to comment.