Merge pull request #27 from vaaaaanquish/multithread

Set default parser
vaaaaanquish · May 9, 2020 · 49d14df
2 parents 9ed680a + a260443
commit 49d14df
Show file tree

Hide file tree

Showing 6 changed files with 22 additions and 12 deletions.
diff --git a/cloudia/main.py b/cloudia/main.py
@@ -15,10 +15,11 @@ def __init__(self,
                  stop_words: List[str] = STOPWORDS,
                  extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
                  parse_func: Any = default_parse_func,
-                 multiprocess: bool = True,
+                 parser: Any = 'default',
+                 multiprocess: bool = False,
                  individual: bool = False,
                  **args):
-        args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags))
+        args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags, parser=parser))
         self.wd = WordData(data, parse_func, multiprocess, individual, **args)
 
     def make_wordcloud(self, dark_theme: bool, rate: int) -> List[Tuple[str, WordCloud]]:

diff --git a/cloudia/pandas_accessor.py b/cloudia/pandas_accessor.py
@@ -18,14 +18,15 @@ def plot(self,
              stop_words: List[str] = STOPWORDS,
              extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
              parse_func: Any = default_parse_func,
+             parser: Any = 'default',
              dark_theme: bool = False,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True,
+             multiprocess: bool = False,
              individual: bool = False,
              **args):
-        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
                 **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):
@@ -43,14 +44,15 @@ def plot(self,
              stop_words: List[str] = STOPWORDS,
              extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
              parse_func: Any = default_parse_func,
+             parser: Any = 'default',
              dark_theme: bool = False,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True,
+             multiprocess: bool = False,
              individual: bool = False,
              **args):
-        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
                 **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):

diff --git a/cloudia/utils.py b/cloudia/utils.py
@@ -11,8 +11,13 @@
 NUM_REGEX = re.compile('^[0-9]+$')
 
 
-def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str]) -> List[str]:
-    parser = nagisa.Tagger(single_word_list=single_words)
+def make_nagisa_tagger(single_words: List[str]):
+    return nagisa.Tagger(single_word_list=single_words)
+
+
+def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str], parser) -> List[str]:
+    if parser == 'default':
+        parser = make_nagisa_tagger(single_words)
     for x in ['"', ';', ',', '(', ')', '\u3000']:
         text = text.replace(x, ' ')
     text = text.lower()

diff --git a/cloudia/word_data.py b/cloudia/word_data.py
@@ -5,7 +5,7 @@
 from joblib import Parallel, delayed
 import pandas as pd
 
-from cloudia.utils import function_wrapper
+from cloudia.utils import function_wrapper, make_nagisa_tagger
 
 
 class WordData:
@@ -42,6 +42,8 @@ def _parse(self, words: List[str], parse_func: Callable[..., List[str]], multipr
         return self._single_thread_parse(words, parse_func, **args)
 
     def _single_thread_parse(self, words: List[str], parse_func: Callable[..., List[str]], **args) -> List[Counter]:
+        if args['parser'] == 'default':
+            args.update({'parser': make_nagisa_tagger(args['single_words'])})
         return [Counter(parse_func(x, **args)) for x in words]
 
     def _parallel_parse(self, words: List[str], parse_func: Callable, **args) -> List[List[Counter]]:

diff --git a/test/unit_test/test_utils.py b/test/unit_test/test_utils.py
@@ -5,7 +5,7 @@
 
 class TestUtils(unittest.TestCase):
     def test_default_parse_func(self):
-        output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'])
+        output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'], 'default')
         self.assertListEqual(output, ['this', 'simple\u3000test'])
 
     def test_function_wrapper(self):

diff --git a/test/unit_test/test_word_data.py b/test/unit_test/test_word_data.py
@@ -79,10 +79,10 @@ def test_convert_weight(self):
         self.assertDictEqual(output, {'hoge': 1, 'piyo': 0.5})
 
     def test_single_thread_parse(self):
-        def f(x):
+        def f(x, parser, single_words):
             return x.split(' ')
 
-        output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f)
+        output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f, **{'parser': 'default', 'single_words': []})
         target = [Counter(['hoge', 'hoge']), Counter(['piyo'])]
         for o, t in zip(output, target):
             self.assertEqual(type(o), type(t))