Replace an outdated FlashText library with a custom implementation
Showing 9 changed files with 256 additions and 32 deletions.
@@ -0,0 +1,33 @@
# Reworked FlashText Keyword Processor

## Overview

This module presents a reworked and optimized version of the [FlashText](https://pypi.org/project/flashtext/) library for Python. It aims to provide a modern, efficient, and easy-to-maintain implementation for keyword extraction in large texts.

## Features

- **Case Sensitivity**: Supports both case-sensitive and case-insensitive keyword processing.
- **Efficient Extraction**: Offers efficient extraction of keywords from large text datasets.
- **Flexible Keyword Addition**: Allows adding keywords from various sources including files, dictionaries, or directly as strings.
- **Optimized Trie Structure**: Utilizes a custom Trie data structure for optimized search performance.

## Usage

Here is a basic example of how to use the reworked KeywordProcessor:

```python
from keyword_processor import KeywordProcessor

# Initialize the processor with case sensitivity preference
processor = KeywordProcessor(case_sensitive=False)

# Add keywords and their respective clean names
processor.add_keyword("Python", "Python Programming")
processor.add_keyword("Java", "Java Programming")

# Extract keywords from text
text = "I love Python and Java programming languages."
extracted_keywords = processor.extract_keywords(text)

print(extracted_keywords)  # Output: ['Python Programming', 'Java Programming']
```
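
Keywords do not have to be added one at a time. The sketch below shows bulk loading through the `add_keywords_from_dict` and `add_keyword_from_file` methods defined in `processor.py`; the `keywords.txt` path and the sample keyword lists are illustrative, and the `keyword=>clean name` line format is the one `add_keyword_from_file` parses:

```python
from pathlib import Path

from keyword_processor import KeywordProcessor

processor = KeywordProcessor(case_sensitive=False)

# Bulk-add keywords: each key is a clean name, each value a list of keyword variants
processor.add_keywords_from_dict({
    "Python Programming": ["python", "py"],
    "Java Programming": ["java", "jvm"],
})

# Keywords can also come from a file, one entry per line, optionally as "keyword=>clean name".
# The file written here is only for illustration.
Path("keywords.txt").write_text("Go=>Go Programming\nRust\n", encoding="utf-8")
processor.add_keyword_from_file("keywords.txt")

print(processor.extract_keywords("I write py, Java and Rust code."))
# ['Python Programming', 'Java Programming', 'Rust']
```

Lines without a `=>` separator (like `Rust` above) fall back to using the keyword itself as the clean name.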
@@ -0,0 +1,147 @@
from collections import defaultdict
from pathlib import Path


class TrieNode:
    """A node in the Trie structure used for efficient keyword searching.

    Attributes:
        children (dict): A dictionary mapping characters to their corresponding TrieNode.
        is_end_of_word (bool): Indicates if the node represents the end of a keyword.
        clean_name (str | None): The associated clean name for the keyword represented by this node.
    """

    def __init__(self) -> None:
        self.children: dict[str, TrieNode] = defaultdict(TrieNode)
        self.is_end_of_word: bool = False
        self.clean_name: str | None = None


class KeywordProcessor:
    """Processes and extracts keywords from text using a Trie structure.

    This class provides methods to add keywords, normalize text, and extract keywords from sentences.

    Attributes:
        case_sensitive (bool): Determines if keyword matching is case-sensitive.
        root (TrieNode): The root node of the Trie structure.
        keyword_map (dict): A map from keywords to their clean names.

    Args:
        case_sensitive (bool): Specifies if the keyword matching should be case-sensitive. Defaults to False.
    """

    def __init__(self, case_sensitive: bool = False) -> None:
        self.case_sensitive: bool = case_sensitive
        self.root: TrieNode = TrieNode()
        self.keyword_map: dict[str, str] = {}

    def _normalize(self, text: str) -> str:
        """
        Normalizes the text based on case sensitivity.

        Args:
            text (str): The text to normalize.

        Returns:
            str: Normalized text.
        """
        return text if self.case_sensitive else text.lower()

    def add_keyword(self, keyword: str, clean_name: str | None = None) -> None:
        """
        Adds a keyword to the trie and keyword map.

        Args:
            keyword (str): The keyword to add.
            clean_name (str | None): The clean name associated with the keyword. Defaults to None.
        """
        clean_name = clean_name or keyword
        keyword = self._normalize(keyword)
        self.keyword_map[keyword] = clean_name

        node = self.root
        for char in keyword:
            node = node.children[char]

        node.is_end_of_word = True
        node.clean_name = clean_name

    def add_keyword_from_file(self, keyword_file: str, encoding: str = "utf-8") -> None:
        """
        Adds keywords from a file.

        Args:
            keyword_file (str): Path to the file containing keywords.
            encoding (str): The encoding of the file. Defaults to "utf-8".
        """
        file_path = Path(keyword_file)

        if not file_path.is_file():
            raise OSError(f"Invalid file path {keyword_file}")

        with file_path.open(encoding=encoding) as file:
            for line in file:
                parts = line.split("=>")
                keyword = parts[0].strip()
                clean_name = parts[1].strip() if len(parts) > 1 and parts[1] else None
                self.add_keyword(keyword, clean_name)

    def add_keywords_from_dict(self, keyword_dict: dict[str, list[str]]) -> None:
        """
        Adds multiple keywords from a dictionary.

        Args:
            keyword_dict (dict[str, list[str]]): Dictionary where each key is a clean name and associated value is a list of keywords.
        """
        for clean_name, keywords in keyword_dict.items():
            for keyword in keywords:
                self.add_keyword(keyword, clean_name)

    def _is_word_boundary(self, sentence: str, start_idx: int, end_idx: int) -> bool:
        """
        Checks if a given index is a word boundary in the sentence.

        Args:
            sentence (str): The sentence to check within.
            start_idx (int): The starting index of the word.
            end_idx (int): The ending index of the word.

        Returns:
            bool: True if the indices represent a word boundary, False otherwise.
        """
        if start_idx > 0 and sentence[start_idx - 1].isalnum():
            return False
        if end_idx < len(sentence) and sentence[end_idx].isalnum():
            return False
        return True

    def extract_keywords(self, sentence: str, span_info: bool = False) -> list[str | tuple[str, int, int]]:
        """
        Extracts keywords from a sentence.

        Args:
            sentence (str): The sentence to extract keywords from.
            span_info (bool): If True, returns a list of tuples with the keyword, start index, and end index. Defaults to False.

        Returns:
            list[str | tuple[str, int, int]]: A list of extracted keywords or tuples containing the keyword and its span in the sentence.
        """
        sentence = self._normalize(sentence)
        results = []

        for start_pos in range(len(sentence)):
            node = self.root
            for end_pos in range(start_pos, len(sentence)):
                char = sentence[end_pos]
                if char in node.children:
                    node = node.children[char]
                    if node.is_end_of_word and self._is_word_boundary(sentence, start_pos, end_pos + 1):
                        clean_name = node.clean_name
                        match = (clean_name, start_pos, end_pos + 1) if span_info else clean_name
                        results.append(match)
                        break
                else:
                    break

        return results
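
For reference, here is a minimal sketch of the `span_info` flag exposed by `extract_keywords` above. The flat `from processor import ...` line is an assumption about the module layout (the tests below use the relative import `from .processor import KeywordProcessor`); adjust it to your package structure.

```python
from processor import KeywordProcessor  # assumed module path; adjust to your layout

kp = KeywordProcessor(case_sensitive=False)
kp.add_keyword("New York", "NYC")

sentence = "Flights to New York leave daily."
for clean_name, start, end in kp.extract_keywords(sentence, span_info=True):
    # Each tuple carries the clean name plus the [start, end) span of the match
    print(clean_name, start, end, sentence[start:end])
# Prints: NYC 11 19 New York
```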
@@ -0,0 +1,55 @@
import pytest

from .processor import KeywordProcessor


class TestKeywordProcessor:
    @pytest.fixture
    def processor(self):
        return KeywordProcessor(case_sensitive=False)

    def test_add_keyword(self, processor):
        processor.add_keyword("Python", "Python Programming")
        assert "python" in processor.keyword_map
        assert processor.keyword_map["python"] == "Python Programming"

    def test_case_sensitivity(self):
        processor = KeywordProcessor(case_sensitive=True)
        processor.add_keyword("Python", "Python Programming")
        assert "Python" in processor.keyword_map
        assert "python" not in processor.keyword_map

    def test_extract_keywords(self, processor):
        processor.add_keyword("python", "Python Programming")
        processor.add_keyword("java", "Java Programming")
        extracted_keywords = processor.extract_keywords("I love Python and Java.")
        assert extracted_keywords == ["Python Programming", "Java Programming"]

    def test_add_keywords_from_dict(self, processor):
        keyword_dict = {"Programming": ["Python", "Java", "C++"]}
        processor.add_keywords_from_dict(keyword_dict)
        assert "python" in processor.keyword_map
        assert "java" in processor.keyword_map
        assert "c++" in processor.keyword_map
        assert processor.keyword_map["python"] == "Programming"

    def test_add_keyword_from_file(self, processor, tmpdir):
        # Create a temporary keyword file
        keyword_file = tmpdir.join("keywords.txt")
        keyword_file.write("Python=>Python Programming\nJava=>Java Programming")
        processor.add_keyword_from_file(str(keyword_file))
        assert "python" in processor.keyword_map
        assert "java" in processor.keyword_map
        assert processor.keyword_map["python"] == "Python Programming"
        assert processor.keyword_map["java"] == "Java Programming"

    def test_add_keyword_from_file_with_invalid_path(self, processor):
        invalid_path = "invalid/path/to/file.txt"
        with pytest.raises(IOError) as excinfo:
            processor.add_keyword_from_file(invalid_path)
        assert str(excinfo.value) == f"Invalid file path {invalid_path}"

    def test_extract_keywords_with_span_info(self, processor):
        processor.add_keyword("python", "Python Programming")
        extracted_keywords = processor.extract_keywords("I love Python.", span_info=True)
        assert extracted_keywords == [("Python Programming", 7, 13)]
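
As `test_case_sensitivity` above suggests, the `case_sensitive` flag affects both how keywords are stored in the trie and how sentences are matched. A minimal sketch of the difference (same assumed `processor` import as in the earlier sketch):

```python
from processor import KeywordProcessor  # assumed module path; adjust to your layout

sentence = "I love Python and python."

strict = KeywordProcessor(case_sensitive=True)
strict.add_keyword("Python", "Python Programming")
# Only the exact casing matches
print(strict.extract_keywords(sentence))   # ['Python Programming']

relaxed = KeywordProcessor(case_sensitive=False)
relaxed.add_keyword("Python", "Python Programming")
# Both spellings match, one result per occurrence
print(relaxed.extract_keywords(sentence))  # ['Python Programming', 'Python Programming']
```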