Merge pull request #26 from vaaaaanquish/individual

Add `individual` option to choose whether words in a list are parsed individually
vaaaaanquish · May 8, 2020 · 9ed680a · 9ed680a
2 parents a0f645a + e6189d0
commit 9ed680a
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -92,7 +92,8 @@ Cloudia(
   stop_words=STOPWORDS,    # not count words, default is wordcloud.STOPWORDS
   extract_postags=['名詞', '英単語', 'ローマ字文'],    # part of speech for japanese
   parse_func=None,    # split text function, example: lambda x: x.split(',')
-  multiprocess=True    # Flag for using multiprocessing
+  multiprocess=True,    # Flag for using multiprocessing
+  individual=False    # for list-of-words input: True parses each word separately, False ' '.join()s each list before parsing
 )
 ```
 
@@ -126,6 +127,7 @@ DataFrame.wc.plot(
   extract_postags=['名詞', '英単語', 'ローマ字文'],    # part of speech for japanese
   parse_func=None,    # split text function, example: lambda x: x.split(',')
   multiprocess=True,    # Flag for using multiprocessing
+  individual=False,    # for list-of-words input: True parses each word separately, False ' '.join()s each list before parsing
   dark_theme=False,    # color theme
   title_size=12,     # title text size
   row_num=3,    # for example, 12 wordcloud, row_num=3 -> 4*3image

diff --git a/cloudia/main.py b/cloudia/main.py
@@ -16,9 +16,10 @@ def __init__(self,
                  extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
                  parse_func: Any = default_parse_func,
                  multiprocess: bool = True,
+                 individual: bool = False,
                  **args):
         args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags))
-        self.wd = WordData(data, parse_func, multiprocess, **args)
+        self.wd = WordData(data, parse_func, multiprocess, individual, **args)
 
     def make_wordcloud(self, dark_theme: bool, rate: int) -> List[Tuple[str, WordCloud]]:
         wordcloud_list = []

diff --git a/cloudia/pandas_accessor.py b/cloudia/pandas_accessor.py
@@ -22,8 +22,11 @@ def plot(self,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True):
-        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, multiprocess).plot(dark_theme, title_size, row_num, figsize_rate)
+             multiprocess: bool = True,
+             individual: bool = False,
+             **args):
+        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+                **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):
         self.plot(**args)
@@ -44,8 +47,11 @@ def plot(self,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True):
-        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, multiprocess).plot(dark_theme, title_size, row_num, figsize_rate)
+             multiprocess: bool = True,
+             individual: bool = False,
+             **args):
+        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+                **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):
         self.plot(**args)

diff --git a/cloudia/word_data.py b/cloudia/word_data.py
@@ -9,18 +9,22 @@
 
 
 class WordData:
-    def __init__(self, data: Any, parse_func: Callable[..., List[str]], multiprocess: bool, **args):
+    def __init__(self, data: Any, parse_func: Callable[..., List[str]], multiprocess: bool, individual: bool, **args):
         words, self.names = self._init_data(data)
-        self.counter_list = self.parse(words, parse_func, multiprocess, **args)
+        self.counter_list = self.parse(words, parse_func, multiprocess, individual, **args)
         self.words = [self.convert_weight(x) for x in self.counter_list]
 
-    def parse(self, words, parse_func: Callable[..., List[str]], multiprocess: bool, **args) -> List[Counter]:
+    def parse(self, words, parse_func: Callable[..., List[str]], multiprocess: bool, individual: bool, **args) -> List[Counter]:
         if isinstance(words[0], list):
             word_list_length = len(words[0])
-            words = list(chain.from_iterable(words))
-            words = self._parse(words, parse_func, multiprocess, **args)
-            words = list(zip_longest(*[iter(words)] * word_list_length))
-            words = [sum(w, Counter()) for w in words]
+            if individual:
+                words = list(chain.from_iterable(words))
+                words = self._parse(words, parse_func, multiprocess, **args)
+                words = list(zip_longest(*[iter(words)] * word_list_length))
+                words = [sum(w, Counter()) for w in words]
+            else:
+                words = [' '.join(x) for x in words]
+                words = self._parse(words, parse_func, multiprocess, **args)
         else:
             words = self._parse(words, parse_func, multiprocess, **args)
         return words

diff --git a/test/unit_test/test_word_data.py b/test/unit_test/test_word_data.py
@@ -7,7 +7,7 @@
 
 class TestWordData(unittest.TestCase):
     def setUp(self):
-        self.cls = WordData('test', lambda x: [x], True)
+        self.cls = WordData('test', lambda x: [x], True, False)
 
     def assertSortTextEqual(self, data, target):
         """for random sample list."""
@@ -60,15 +60,15 @@ def _parse(x, y, z, **args):
             return x
 
         with patch('cloudia.word_data.WordData._parse', side_effect=_parse):
-            output = self.cls.parse(['hoge hoge', 'piyo'], None, None)
+            output = self.cls.parse(['hoge hoge', 'piyo'], None, None, False)
             self.assertListEqual(output, ['hoge hoge', 'piyo'])
 
     def test_parse_list_case(self):
         def _parse(x, y, z, **args):
             return [Counter(w.split(' ')) for w in x]
 
         with patch('cloudia.word_data.WordData._parse', side_effect=_parse):
-            output = self.cls.parse([['hoge hoge', 'piyo'], ['fuga', 'fuga']], None, None)
+            output = self.cls.parse([['hoge hoge', 'piyo'], ['fuga', 'fuga']], None, None, False)
             target = [Counter({'hoge': 2, 'piyo': 1}), Counter({'fuga': 2})]
             for o, t in zip(output, target):
                 self.assertEqual(type(o), type(t))