wip

harelba · Aug 30, 2020 · 7abaab5 · 7abaab5
1 parent b2f8a0e
commit 7abaab5
Show file tree

Hide file tree

Showing 34 changed files with 1,204 additions and 162 deletions.
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ packages
 dist/windows/
 _benchmark_data*
 *.benchmark-results
+generated-site/
diff --git a/.travis.yml b/.travis.yml
@@ -1,11 +1,136 @@
-language: python
-python:
-  - "2.7"
-  - "3.6"
-matrix:
+sudo: false
+
+stages:
+  - integration
+  - release
+
+env:
+  global:
+    - CACHE_NAME=${TRAVIS_JOB_NAME}
+
+
+_commands_provider:
+
+  _test: &_test make test
+
+  _lint: &_lint make lint
+
+  _release: &_release make local-release
+
+  _install_requirements: &_install_requirements make dep
+
+  # https://ttcshelbyville.wordpress.com/2012/12/19/disable-remote-differential-compression-form-the-command-line/
+  _disable_windows_compression: &_disable_windows_compression "powershell Disable-WindowsOptionalFeature -Online -FeatureName MSRDC-Infrastructure"
+
+  # https://travis-ci.community/t/yarn-network-troubles/333/7
+  _disable_windows_defender: &_disable_windows_defender "powershell Set-MpPreference -DisableRealtimeMonitoring \\$true"
+
+
+_steps_provider:
+
+  _test: &_step_test
+
+    install:
+      - *_install_requirements
+    before_script: *_lint
+    script: *_test
+
+  _release: &_step_release
+
+    install: *_install_requirements
+    script: *_release
+
+
+
+jobs:
   include:
-    - python: "3.7"
-      dist: xenial  # Need for python 3.7
-install: pip install -r requirements.txt
-before_script: flake8 ./bin/q ./test/test-suite --count --select=E901,E999,F821,F822,F823 --show-source --statistics
-script: PYTHONIOENCODING=UTF-8 test/test-all
+    - stage: integration
+      name: py27-macos
+      os: osx
+      language: generic
+      osx_image: xcode7.3
+      env:
+        - PYENV_VERSION=2.7.14
+      before_install: source setup-pyenv.sh
+      <<: *_step_test
+      cache:
+        directories:
+          - ${HOME}/.pyenv_cache
+
+    - stage: integration
+      name: py36-macos
+      os: osx
+      language: generic
+      osx_image: xcode7.3
+      env:
+        - PYENV_VERSION=3.6.4
+      before_install: source setup-pyenv.sh
+      <<: *_step_test
+      cache:
+        directories:
+          - ${HOME}/.pyenv_cache
+
+    - stage: integration
+      name: py37-macos
+      os: osx
+      language: generic
+      osx_image: xcode7.3
+      env:
+        - PYENV_VERSION=3.7.3
+      before_install: source setup-pyenv.sh
+      <<: *_step_test
+      cache:
+        directories:
+          - ${HOME}/.pyenv_cache
+
+    - stage: integration
+      name: py27-linux
+      language: python
+      python: "2.7"
+      <<: *_step_test
+
+    - stage: integration
+      name: py36-linux
+      language: python
+      python: "3.6"
+      <<: *_step_test
+
+    - stage: integration
+      name: py37-linux
+      language: python
+      dist: xenial
+      python: "3.7"
+      <<: *_step_test
+
+    - stage: release
+      name: macos
+      os: osx
+      language: generic
+      osx_image: xcode7.3
+      env:
+        - PYENV_VERSION=3.7.3
+      before_install: source setup-pyenv.sh
+      <<: *_step_release
+      cache:
+        directories:
+          - ${HOME}/.pyenv_cache
+
+    - stage: release
+      name: linux
+      language: python
+      dist: xenial
+      python: "3.7"
+      <<: *_step_release
+
+    - stage: release
+      name: windows
+      os: windows
+      language: shell
+      env:
+        - PATH=/c/Python37:/c/Python37/Scripts:$PATH
+      before_install:
+        - *_disable_windows_compression
+        - *_disable_windows_defender
+        - choco install make
+        - choco install python --version 3.7.3
+      <<: *_step_release
diff --git a/Makefile b/Makefile
@@ -0,0 +1,37 @@
+SHELL := /bin/bash
+
+PROJECT_NAME=$(shell dirname "$0")
+ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+
+.PHONY: test help
+.DEFAULT_GOAL := ci
+
+ci: lint test ## Equivalent to 'make lint test'
+
+help: ## Show this help message.
+
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+dep: ## Install the dependent libraries.
+
+	pip install -r test-requirements.txt
+	pip install -e .
+
+lint: dep ## Run lint validations.
+
+	flake8 q/ --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+
+test: dep ## Run the unit tests.
+
+	test/test-all
+	## TODO Bring back pytest
+	## py.test -rs -c pytest.ini -s -v q/tests/suite.py --rootdir . 
+
+release: ## Run release
+	pip install py-ci
+	pyci release --no-wheel-publish --wheel-universal
+
+local-release:
+	pip install py-ci
+	./do-manual-release.sh
+
diff --git a/README.markdown b/README.markdown
@@ -10,7 +10,7 @@ q's web site is [http://harelba.github.io/q/](http://harelba.github.io/q/). It c
 ## Installation.
 Extremely simple. 
 
-Instructions for all OSs are [here](http://harelba.github.io/q/install.html). 
+Instructions for all OSs are [here](http://harelba.github.io/q/#installation). 
 
 ## Examples
 
@@ -20,18 +20,19 @@ q "SELECT COUNT(*) FROM ./clicks_file.csv WHERE c3 > 32.3"
 ps -ef | q -H "SELECT UID, COUNT(*) cnt FROM - GROUP BY UID ORDER BY cnt DESC LIMIT 3"
 ```
 
-Go [here](http://harelba.github.io/q/examples.html) for more examples.
+Go [here](http://harelba.github.io/q/#examples) for more examples.
 
 ## Python API
 A development branch for exposing q's capabilities as a <strong>Python module</strong> can be viewed <a href="https://github.com/harelba/q/tree/generic-injected-streams/PYTHON-API.markdown">here</a>, along with examples of the alpha version of the API.<br/>Existing functionality as a command-line tool will not be affected by this. Your input will be most appreciated.
 
-## Change log
-Click [here](http://harelba.github.io/q/changelog.html) to see the change log.
-
 ## Contact
 Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.
 
-Harel Ben-Attia, [email protected], [@harelba](https://twitter.com/harelba) on Twitter
+LinkedIn: [Harel Ben Attia](https://www.linkedin.com/in/harelba/)
+
+Twitter: [@harelba](https://twitter.com/harelba)
+
+Email: [[email protected]](mailto:[email protected])
 
 q on twitter: #qtextasdata
 
diff --git a/bin/__version__.py b/bin/__version__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+q_version = '2.0.12'
+
+
+if __name__ == '__main__':
+    print(q_version)
diff --git a/bin/q → bin/q.py b/bin/q → bin/q.py
@@ -30,8 +30,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
-q_version = "2.0.6"
+from .__version__ import q_version
 
 __all__ = [ 'QTextAsData' ]
 
@@ -476,16 +475,18 @@ def __init__(self, mode, expected_column_count, input_delimiter, skip_header=Fal
         self.rows = []
         self.skip_header = skip_header
         self.header_row = None
+        self.header_row_filename = None
         self.expected_column_count = expected_column_count
         self.input_delimiter = input_delimiter
         self.disable_column_type_detection = disable_column_type_detection
 
-    def analyze(self, col_vals):
+    def analyze(self, filename, col_vals):
         if self.inferred:
             raise Exception("Already inferred columns")
 
         if self.skip_header and self.header_row is None:
             self.header_row = col_vals
+            self.header_row_filename = filename
         else:
             self.rows.append(col_vals)
 
@@ -905,17 +906,36 @@ def _pre_populate(self,dialect):
             mfs = MaterializedFileState(filename,f,self.encoding,dialect,is_stdin)
             self.materialized_file_dict[filename] = mfs
 
+    def _should_skip_extra_headers(self, filenumber, filename, mfs, col_vals):
+        if not self.skip_header:
+            return False
+
+        if filenumber == 0:
+            return False
+
+        header_already_exists = self.column_inferer.header_row is not None
+
+        is_extra_header = self.skip_header and mfs.lines_read == 1 and header_already_exists
+
+        if is_extra_header:
+            if tuple(self.column_inferer.header_row) != tuple(col_vals):
+                raise BadHeaderException("Extra header {} in file {} mismatches original header {} from file {}. Table name is {}".format(",".join(col_vals),mfs.filename,",".join(self.column_inferer.header_row),self.column_inferer.header_row_filename,self.filenames_str))
+
+        return is_extra_header
+
     def _populate(self,dialect,stop_after_analysis=False):
         total_data_lines_read = 0
 
         # For each match
-        for filename in self.materialized_file_list:
+        for filenumber,filename in enumerate(self.materialized_file_list):
             mfs = self.materialized_file_dict[filename]
 
             try:
                 try:
                     for col_vals in mfs.read_file_using_csv():
-                        self._insert_row(col_vals)
+                        if self._should_skip_extra_headers(filenumber,filename,mfs,col_vals):
+                            continue
+                        self._insert_row(filename, col_vals)
                         if stop_after_analysis and self.column_inferer.inferred:
                             return
                     if mfs.lines_read == 0 and self.skip_header:
@@ -937,7 +957,7 @@ def _populate(self,dialect,stop_after_analysis=False):
 
             if not self.table_created:
                 self.column_inferer.force_analysis()
-                self._do_create_table()
+                self._do_create_table(filename)
 
 
         if total_data_lines_read == 0:
@@ -960,20 +980,20 @@ def populate(self,dialect,stop_after_analysis=False):
             self.state = TableCreatorState.FULLY_READ
             return
 
-    def _flush_pre_creation_rows(self):
+    def _flush_pre_creation_rows(self, filename):
         for i, col_vals in enumerate(self.pre_creation_rows):
             if self.skip_header and i == 0:
                 # skip header line
                 continue
-            self._insert_row(col_vals)
+            self._insert_row(filename, col_vals)
         self._flush_inserts()
         self.pre_creation_rows = []
 
-    def _insert_row(self, col_vals):
+    def _insert_row(self, filename, col_vals):
         # If table has not been created yet
         if not self.table_created:
             # Try to create it along with another "example" line of data
-            self.try_to_create_table(col_vals)
+            self.try_to_create_table(filename, col_vals)
 
         # If the table is still not created, then we don't have enough data, just
         # store the data and return
@@ -1069,19 +1089,19 @@ def _flush_inserts(self):
         # print self.db.execute_and_fetch(self.db.generate_end_transaction())
         self.buffered_inserts = []
 
-    def try_to_create_table(self, col_vals):
+    def try_to_create_table(self, filename, col_vals):
         if self.table_created:
             raise Exception('Table is already created')
 
         # Add that line to the column inferer
-        result = self.column_inferer.analyze(col_vals)
+        result = self.column_inferer.analyze(filename, col_vals)
         # If inferer succeeded,
         if result:
-            self._do_create_table()
+            self._do_create_table(filename)
         else:
             pass  # We don't have enough information for creating the table yet
 
-    def _do_create_table(self):
+    def _do_create_table(self,filename):
         # Then generate a temp table name
         self.table_name = self.db.generate_temp_table_name()
         # Get the column definition dict from the inferer
@@ -1101,7 +1121,7 @@ def _do_create_table(self):
         self.db.execute_and_fetch(create_table_stmt)
         # Mark the table as created
         self.table_created = True
-        self._flush_pre_creation_rows()
+        self._flush_pre_creation_rows(filename)
 
     def drop_table(self):
         if self.table_created:
@@ -1122,7 +1142,8 @@ def determine_max_col_lengths(m,output_field_quoting_func,output_delimiter):
 
 def print_credentials():
     print("q version %s" % q_version, file=sys.stderr)
-    print("Copyright (C) 2012-2017 Harel Ben-Attia ([email protected], @harelba on twitter)", file=sys.stderr)
+    print("Python: %s" % " // ".join([str(x).strip() for x in sys.version.split("\n")]), file=sys.stderr)
+    print("Copyright (C) 2012-2019 Harel Ben-Attia ([email protected], @harelba on twitter)", file=sys.stderr)
     print("http://harelba.github.io/q/", file=sys.stderr)
     print(file=sys.stderr)
 
@@ -1403,7 +1424,7 @@ def _execute(self,query_str,input_params=None,stdin_file=None,stdin_filename='-'
             msg = str(e)
             error = QError(e,"query error: %s" % msg,1)
             if "no such column" in msg and effective_input_params.skip_header:
-                warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names'))
+                warnings.append(QWarning(e,'Warning - There seems to be a "no such column" error, and -H (header line) exists. Please make sure that you are using the column names from the header line and not the default (cXX) column names. Another issue might be that the file contains a BOM. Files that are encoded with UTF8 and contain a BOM can be read by specifying `-e utf-8-sig` in the command line. Support for non-UTF8 encoding will be provided in the future.'))
         except ColumnCountMismatchException as e:
             error = QError(e,e.msg,2)
         except (UnicodeDecodeError, UnicodeError) as e: