Skip to content

Commit

Permalink
Replace an outdated FlashText library with a custom implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
spyker77 committed Dec 8, 2023
1 parent 6abf830 commit fff3892
Show file tree
Hide file tree
Showing 9 changed files with 256 additions and 32 deletions.
33 changes: 33 additions & 0 deletions keyword_processor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Reworked FlashText Keyword Processor

## Overview

This module presents a reworked and optimized version of the [FlashText](https://pypi.org/project/flashtext/) library for Python. It aims to provide a modern, efficient, and easy-to-maintain implementation for keyword extraction in large texts.

## Features

- **Case Sensitivity**: Supports both case-sensitive and case-insensitive keyword processing.
- **Efficient Extraction**: Offers efficient extraction of keywords from large text datasets.
- **Flexible Keyword Addition**: Allows adding keywords from various sources including files, dictionaries, or directly as strings.
- **Optimized Trie Structure**: Utilizes a custom Trie data structure for optimized search performance.

## Usage

Here is a basic example of how to use the reworked KeywordProcessor:

```python
from keyword_processor import KeywordProcessor

# Initialize the processor with case sensitivity preference
processor = KeywordProcessor(case_sensitive=False)

# Add keywords and their respective clean names
processor.add_keyword("Python", "Python Programming")
processor.add_keyword("Java", "Java Programming")

# Extract keywords from text
text = "I love Python and Java programming languages."
extracted_keywords = processor.extract_keywords(text)

print(extracted_keywords) # Output: ['Python Programming', 'Java Programming']
```
Empty file added keyword_processor/__init__.py
Empty file.
147 changes: 147 additions & 0 deletions keyword_processor/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from collections import defaultdict
from pathlib import Path


class TrieNode:
"""A node in the Trie structure used for efficient keyword searching.
Attributes:
children (dict): A dictionary mapping characters to their corresponding TrieNode.
is_end_of_word (bool): Indicates if the node represents the end of a keyword.
clean_name (str | None): The associated clean name for the keyword represented by this node.
"""

def __init__(self) -> None:
self.children: dict[str, TrieNode] = defaultdict(TrieNode)
self.is_end_of_word: bool = False
self.clean_name: str | None = None


class KeywordProcessor:
"""Processes and extracts keywords from text using a Trie structure.
This class provides methods to add keywords, normalize text, and extract keywords from sentences.
Attributes:
case_sensitive (bool): Determines if keyword matching is case-sensitive.
root (TrieNode): The root node of the Trie structure.
keyword_map (dict): A map from keywords to their clean names.
Args:
case_sensitive (bool): Specifies if the keyword matching should be case-sensitive. Defaults to False.
"""

def __init__(self, case_sensitive: bool = False) -> None:
self.case_sensitive: bool = case_sensitive
self.root: TrieNode = TrieNode()
self.keyword_map: dict[str, str] = {}

def _normalize(self, text: str) -> str:
"""
Normalizes the text based on case sensitivity.
Args:
text (str): The text to normalize.
Returns:
str: Normalized text.
"""
return text if self.case_sensitive else text.lower()

def add_keyword(self, keyword: str, clean_name: str | None = None) -> None:
"""
Adds a keyword to the trie and keyword map.
Args:
keyword (str): The keyword to add.
clean_name (str | None): The clean name associated with the keyword. Defaults to None.
"""
clean_name = clean_name or keyword
keyword = self._normalize(keyword)
self.keyword_map[keyword] = clean_name

node = self.root
for char in keyword:
node = node.children[char]

node.is_end_of_word = True
node.clean_name = clean_name

def add_keyword_from_file(self, keyword_file: str, encoding: str = "utf-8") -> None:
"""
Adds keywords from a file.
Args:
keyword_file (str): Path to the file containing keywords.
encoding (str): The encoding of the file. Defaults to "utf-8".
"""
file_path = Path(keyword_file)

if not file_path.is_file():
raise OSError(f"Invalid file path {keyword_file}")

with file_path.open(encoding=encoding) as file:
for line in file:
parts = line.split("=>")
keyword = parts[0].strip()
clean_name = parts[1].strip() if len(parts) > 1 and parts[1] else None
self.add_keyword(keyword, clean_name)

def add_keywords_from_dict(self, keyword_dict: dict[str, list[str]]) -> None:
"""
Adds multiple keywords from a dictionary.
Args:
keyword_dict (dict[str, list[str]]): Dictionary where each key is a clean name and associated value is a list of keywords.
"""
for clean_name, keywords in keyword_dict.items():
for keyword in keywords:
self.add_keyword(keyword, clean_name)

def _is_word_boundary(self, sentence: str, start_idx: int, end_idx: int) -> bool:
"""
Checks if a given index is a word boundary in the sentence.
Args:
sentence (str): The sentence to check within.
start_idx (int): The starting index of the word.
end_idx (int): The ending index of the word.
Returns:
bool: True if the indices represent a word boundary, False otherwise.
"""
if start_idx > 0 and sentence[start_idx - 1].isalnum():
return False
if end_idx < len(sentence) and sentence[end_idx].isalnum():
return False
return True

def extract_keywords(self, sentence: str, span_info: bool = False) -> list[str | tuple[str, int, int]]:
"""
Extracts keywords from a sentence.
Args:
sentence (str): The sentence to extract keywords from.
span_info (bool): If True, returns a list of tuples with the keyword, start index, and end index. Defaults to False.
Returns:
list[str | tuple[str, int, int]]: A list of extracted keywords or tuples containing the keyword and its span in the sentence.
"""
sentence = self._normalize(sentence)
results = []

for start_pos in range(len(sentence)):
node = self.root
for end_pos in range(start_pos, len(sentence)):
char = sentence[end_pos]
if char in node.children:
node = node.children[char]
if node.is_end_of_word and self._is_word_boundary(sentence, start_pos, end_pos + 1):
clean_name = node.clean_name
match = (clean_name, start_pos, end_pos + 1) if span_info else clean_name
results.append(match)
break
else:
break

return results
55 changes: 55 additions & 0 deletions keyword_processor/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from .processor import KeywordProcessor


class TestKeywordProcessor:
    """Unit tests covering keyword insertion, extraction, and file loading."""

    @pytest.fixture
    def processor(self):
        """Provide a fresh case-insensitive processor for each test."""
        return KeywordProcessor(case_sensitive=False)

    def test_add_keyword(self, processor):
        processor.add_keyword("Python", "Python Programming")
        assert processor.keyword_map.get("python") == "Python Programming"

    def test_case_sensitivity(self):
        sensitive = KeywordProcessor(case_sensitive=True)
        sensitive.add_keyword("Python", "Python Programming")
        assert "Python" in sensitive.keyword_map
        assert "python" not in sensitive.keyword_map

    def test_extract_keywords(self, processor):
        for keyword, clean in (("python", "Python Programming"), ("java", "Java Programming")):
            processor.add_keyword(keyword, clean)
        result = processor.extract_keywords("I love Python and Java.")
        assert result == ["Python Programming", "Java Programming"]

    def test_add_keywords_from_dict(self, processor):
        processor.add_keywords_from_dict({"Programming": ["Python", "Java", "C++"]})
        assert all(k in processor.keyword_map for k in ("python", "java", "c++"))
        assert processor.keyword_map["python"] == "Programming"

    def test_add_keyword_from_file(self, processor, tmpdir):
        # Write a two-line keyword file in the "keyword=>clean name" format.
        keyword_file = tmpdir.join("keywords.txt")
        keyword_file.write("Python=>Python Programming\nJava=>Java Programming")
        processor.add_keyword_from_file(str(keyword_file))
        for key, clean in (("python", "Python Programming"), ("java", "Java Programming")):
            assert processor.keyword_map.get(key) == clean

    def test_add_keyword_from_file_with_invalid_path(self, processor):
        bad_path = "invalid/path/to/file.txt"
        with pytest.raises(OSError) as excinfo:
            processor.add_keyword_from_file(bad_path)
        assert str(excinfo.value) == f"Invalid file path {bad_path}"

    def test_extract_keywords_with_span_info(self, processor):
        processor.add_keyword("python", "Python Programming")
        spans = processor.extract_keywords("I love Python.", span_info=True)
        assert spans == [("Python Programming", 7, 13)]
38 changes: 14 additions & 24 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ django-storages = "^1.14.2"
djangorestframework = "^3.14.0"
djangorestframework-simplejwt = "^5.3.1"
drf-spectacular = "^0.26.5"
flashtext = "^2.7"
gunicorn = "^21.2.0"
pypdf = "^3.17.1"
python = "^3.12"
Expand Down
7 changes: 3 additions & 4 deletions resume_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

from django.core.cache import cache
from django.core.files.uploadedfile import InMemoryUploadedFile
from flashtext import KeywordProcessor
from pypdf import PdfReader

from keyword_processor import KeywordProcessor
from scrapers.models import Skill, Vacancy


Expand All @@ -25,9 +25,8 @@ def find_skills_in_resume(text_from_resume: str) -> set[str]:
cache.set("skills_from_db", skills_from_db, 12 * 60 * 60)

keyword_processor = KeywordProcessor()
for clean_name, unclean_names in skills_from_db:
for unclean_name in unclean_names:
keyword_processor.add_keyword(unclean_name, clean_name)
dict_of_skills = {clean_name: unclean_names for clean_name, unclean_names in skills_from_db}
keyword_processor.add_keywords_from_dict(dict_of_skills)

skills_from_resume = set(keyword_processor.extract_keywords(text_from_resume))
return skills_from_resume
Expand Down
5 changes: 3 additions & 2 deletions scrapers/management/shared/base_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
from collections import Counter
from urllib.parse import urlencode

from flashtext import KeywordProcessor
from playwright.sync_api import ElementHandle, Page
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from tenacity import RetryError, Retrying, stop_after_attempt, wait_random_exponential

from keyword_processor import KeywordProcessor

from .utils import get_playwright_page

logger = logging.getLogger("django")
Expand Down Expand Up @@ -178,7 +179,7 @@ def process_vacancy_content(self, vacancy_without_skills: dict[str, str], keywor
Args:
vacancy_without_skills (dict[str, str]): A dictionary containing vacancy details.
keyword_processor (KeywordProcessor): An instance of flashtext.KeywordProcessor.
keyword_processor (KeywordProcessor): An instance of KeywordProcessor.
Returns:
dict: A dictionary with vacancy details and extracted skills.
Expand Down
2 changes: 1 addition & 1 deletion scrapers/management/shared/generic_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from django.core.management.base import BaseCommand
from django.db import OperationalError
from flashtext import KeywordProcessor

from keyword_processor import KeywordProcessor
from scrapers.models import Job, Skill, Vacancy

from .base_scraper import BaseScraper
Expand Down

0 comments on commit fff3892

Please sign in to comment.