Skip to content

Commit

Permalink
Merge pull request #27 from vaaaaanquish/multithread
Browse files Browse the repository at this point in the history
Set default parser
  • Loading branch information
vaaaaanquish authored May 9, 2020
2 parents 9ed680a + a260443 commit 49d14df
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 12 deletions.
5 changes: 3 additions & 2 deletions cloudia/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ def __init__(self,
stop_words: List[str] = STOPWORDS,
extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
parse_func: Any = default_parse_func,
multiprocess: bool = True,
parser: Any = 'default',
multiprocess: bool = False,
individual: bool = False,
**args):
args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags))
args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags, parser=parser))
self.wd = WordData(data, parse_func, multiprocess, individual, **args)

def make_wordcloud(self, dark_theme: bool, rate: int) -> List[Tuple[str, WordCloud]]:
Expand Down
10 changes: 6 additions & 4 deletions cloudia/pandas_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ def plot(self,
stop_words: List[str] = STOPWORDS,
extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
parse_func: Any = default_parse_func,
parser: Any = 'default',
dark_theme: bool = False,
title_size: int = 12,
row_num: int = 3,
figsize_rate: int = 2,
multiprocess: bool = True,
multiprocess: bool = False,
individual: bool = False,
**args):
Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
**args).plot(dark_theme, title_size, row_num, figsize_rate)

def save(self, fig_path: str, dark_theme: bool, **args: Any):
Expand All @@ -43,14 +44,15 @@ def plot(self,
stop_words: List[str] = STOPWORDS,
extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
parse_func: Any = default_parse_func,
parser: Any = 'default',
dark_theme: bool = False,
title_size: int = 12,
row_num: int = 3,
figsize_rate: int = 2,
multiprocess: bool = True,
multiprocess: bool = False,
individual: bool = False,
**args):
Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
**args).plot(dark_theme, title_size, row_num, figsize_rate)

def save(self, fig_path: str, dark_theme: bool, **args: Any):
Expand Down
9 changes: 7 additions & 2 deletions cloudia/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@
NUM_REGEX = re.compile('^[0-9]+$')


def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str]) -> List[str]:
parser = nagisa.Tagger(single_word_list=single_words)
def make_nagisa_tagger(single_words: List[str]):
return nagisa.Tagger(single_word_list=single_words)


def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str], parser) -> List[str]:
if parser == 'default':
parser = make_nagisa_tagger(single_words)
for x in ['"', ';', ',', '(', ')', '\u3000']:
text = text.replace(x, ' ')
text = text.lower()
Expand Down
4 changes: 3 additions & 1 deletion cloudia/word_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from joblib import Parallel, delayed
import pandas as pd

from cloudia.utils import function_wrapper
from cloudia.utils import function_wrapper, make_nagisa_tagger


class WordData:
Expand Down Expand Up @@ -42,6 +42,8 @@ def _parse(self, words: List[str], parse_func: Callable[..., List[str]], multipr
return self._single_thread_parse(words, parse_func, **args)

def _single_thread_parse(self, words: List[str], parse_func: Callable[..., List[str]], **args) -> List[Counter]:
if args['parser'] == 'default':
args.update({'parser': make_nagisa_tagger(args['single_words'])})
return [Counter(parse_func(x, **args)) for x in words]

def _parallel_parse(self, words: List[str], parse_func: Callable, **args) -> List[List[Counter]]:
Expand Down
2 changes: 1 addition & 1 deletion test/unit_test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class TestUtils(unittest.TestCase):
def test_default_parse_func(self):
output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'])
output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'], 'default')
self.assertListEqual(output, ['this', 'simple\u3000test'])

def test_function_wrapper(self):
Expand Down
4 changes: 2 additions & 2 deletions test/unit_test/test_word_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ def test_convert_weight(self):
self.assertDictEqual(output, {'hoge': 1, 'piyo': 0.5})

def test_single_thread_parse(self):
def f(x):
def f(x, parser, single_words):
return x.split(' ')

output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f)
output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f, **{'parser': 'default', 'single_words': []})
target = [Counter(['hoge', 'hoge']), Counter(['piyo'])]
for o, t in zip(output, target):
self.assertEqual(type(o), type(t))
Expand Down

0 comments on commit 49d14df

Please sign in to comment.