From ce5eb4b8d1796a38b5187694d65d0f7e1e05cd0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 21 Mar 2024 08:43:58 +0000 Subject: [PATCH 01/71] [DOP-13779] Bump version --- onetl/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/VERSION b/onetl/VERSION index 5eef0f10e..a3f5a8ed4 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.2 +0.10.3 From 14902fd1391c6d6983887451690e08647ad22def Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 06:42:56 +0000 Subject: [PATCH 02/71] Bump the github-actions group with 1 update Bumps the github-actions group with 1 update: [mikefarah/yq](https://github.com/mikefarah/yq). Updates `mikefarah/yq` from 4.42.1 to 4.43.1 - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.42.1...v4.43.1) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index e4579eb28..d8a0afcad 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: 
matrix-oracle - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ -634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.43.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 98fa254a7011d412d2bfa0f2bc2891f89560949e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 20:52:10 +0000 Subject: [PATCH 03/71] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.15.1 → v3.15.2](https://github.com/asottile/pyupgrade/compare/v3.15.1...v3.15.2) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 516ea1cf1..e7ea7d8a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -89,7 +89,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.1 + rev: v3.15.2 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From cb40a74b0cab2261a29e304626a78658d1862f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Mar 2024 07:39:59 +0000 Subject: [PATCH 04/71] Fix wemake-python-styleguide errors --- .pre-commit-config.yaml | 3 ++- setup.cfg | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7ea7d8a9..591792724 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,6 +67,7 @@ repos: rev: v6.2.1 hooks: - id: beautysh + additional_dependencies: [setuptools] - 
repo: https://github.com/IamTheFij/docker-pre-commit rev: v3.0.1 @@ -89,7 +90,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.15.1 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] diff --git a/setup.cfg b/setup.cfg index 96b763b7c..164c6ac3e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -273,7 +273,9 @@ ignore = # E704 multiple statements on one line: def func(): ... E704, # WPS220 Found too deep nesting: 46 > 20 - WPS220 + WPS220, +# WPS474 Found import object collision + WPS474 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = From f593f8a2216bcefde79c6f316e7e36f15d9cdccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Mar 2024 13:58:39 +0000 Subject: [PATCH 05/71] Fix wemake-python-styleguide errors --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 164c6ac3e..d1f770d78 100644 --- a/setup.cfg +++ b/setup.cfg @@ -272,8 +272,6 @@ ignore = # https://github.com/wemake-services/wemake-python-styleguide/issues/2847 # E704 multiple statements on one line: def func(): ... E704, -# WPS220 Found too deep nesting: 46 > 20 - WPS220, # WPS474 Found import object collision WPS474 From c3bf4cd11c59897641f9a33c7a52d8f2f3d3cc86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 28 Mar 2024 07:41:32 +0000 Subject: [PATCH 06/71] Update README --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index ffc68e52b..0095b69d6 100644 --- a/README.rst +++ b/README.rst @@ -53,6 +53,7 @@ Non-goals Requirements ------------ + * **Python 3.7 - 3.12** * PySpark 2.3.x - 3.5.x (depends on used connector) * Java 8+ (required by Spark, see below) From 1d080599c89fef9b7dde1556ef69537683f16928 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 06:55:27 +0000 Subject: [PATCH 07/71] Bump the github-actions group with 1 update Bumps the github-actions group with 1 update: [tj-actions/changed-files](https://github.com/tj-actions/changed-files). Updates `tj-actions/changed-files` from 43 to 44 - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v43...v44) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 46 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index d8a0afcad..5cf33028e 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -86,7 +86,7 @@ jobs: - name: Check if base files are changed id: changed-base - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/base/tracked.txt files_ignore_from_source_file: .github/workflows/data/base/ignored.txt @@ -97,7 +97,7 @@ jobs: - name: Check if db-related files are changed id: changed-db - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/db/tracked.txt files_ignore_from_source_file: .github/workflows/data/db/ignored.txt @@ -108,7 +108,7 @@ jobs: - name: Check if file-related files are changed id: changed-file - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/file/tracked.txt files_ignore_from_source_file: .github/workflows/data/file/ignored.txt @@ -119,7 +119,7 @@ jobs: - name: Check if file-df-related files are changed id: changed-file-df - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/file-df/tracked.txt files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt @@ -130,7 +130,7 @@ jobs: - name: Check if core files are changed id: changed-core - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/core/tracked.txt files_ignore_from_source_file: .github/workflows/data/core/ignored.txt @@ -160,7 +160,7 @@ jobs: - name: Check if Clickhouse files are changed id: changed-clickhouse - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/clickhouse/tracked.txt files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt @@ -190,7 +190,7 @@ jobs: - name: Check if Greenplum files are changed id: changed-greenplum - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/greenplum/tracked.txt files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt @@ -220,7 +220,7 @@ jobs: - name: Check if Hive files are changed id: changed-hive - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/hive/tracked.txt files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt @@ -250,7 +250,7 @@ jobs: - name: Check if Kafka files are changed id: changed-kafka - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/kafka/tracked.txt files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt @@ -280,7 +280,7 @@ jobs: - name: Check if LocalFS files are changed id: changed-local-fs - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/local-fs/tracked.txt files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt @@ -310,7 +310,7 @@ jobs: - name: Check if MongoDB files are changed id: changed-mongodb - uses: tj-actions/changed-files@v43 + uses: 
tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mongodb/tracked.txt files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt @@ -340,7 +340,7 @@ jobs: - name: Check if MSSQL files are changed id: changed-mssql - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mssql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt @@ -370,7 +370,7 @@ jobs: - name: Check if MySQL files are changed id: changed-mysql - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mysql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt @@ -400,7 +400,7 @@ jobs: - name: Check if Oracle files are changed id: changed-oracle - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/oracle/tracked.txt files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt @@ -430,7 +430,7 @@ jobs: - name: Check if Postgres files are changed id: changed-postgres - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/postgres/tracked.txt files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt @@ -460,7 +460,7 @@ jobs: - name: Check if Teradata files are changed id: changed-teradata - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/teradata/tracked.txt files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt @@ -490,7 +490,7 @@ jobs: - name: Check if FTP files are changed id: changed-ftp - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/ftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt @@ -520,7 +520,7 @@ jobs: - name: Check if FTPS files are changed id: changed-ftps - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/ftps/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt @@ -550,7 +550,7 @@ jobs: - name: Check if HDFS files are changed id: changed-hdfs - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/hdfs/tracked.txt files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt @@ -580,7 +580,7 @@ jobs: - name: Check if S3 files are changed id: changed-s3 - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/s3/tracked.txt files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt @@ -610,7 +610,7 @@ jobs: - name: Check if SFTP files are changed id: changed-sftp - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/sftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt @@ -640,7 +640,7 @@ jobs: - name: Check if Samba files are changed id: changed-samba - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/samba/tracked.txt files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt @@ -670,7 +670,7 @@ jobs: - name: Check if WebDAV files are 
changed id: changed-webdav - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/webdav/tracked.txt files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt From 5e75acab77df351d5d45fb87562685d26c559263 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Apr 2024 21:05:53 +0000 Subject: [PATCH 08/71] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/macisamuele/language-formatters-pre-commit-hooks: v2.12.0 → v2.13.0](https://github.com/macisamuele/language-formatters-pre-commit-hooks/compare/v2.12.0...v2.13.0) - [github.com/asottile/pyupgrade: v3.15.1 → v3.15.2](https://github.com/asottile/pyupgrade/compare/v3.15.1...v3.15.2) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 591792724..b8aa37707 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,7 +58,7 @@ repos: args: [-w] - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.12.0 + rev: v2.13.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '2', --preserve-quotes, --offset, '2'] @@ -90,7 +90,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.1 + rev: v3.15.2 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 987a68c74f19dde4a1a267c9c16549771f1f46fa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Apr 2024 21:24:23 +0000 Subject: [PATCH 09/71] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/pre-commit-hooks: v4.5.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.5.0...v4.6.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8aa37707..2d2c6ff79 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: check-ast - id: check-case-conflict From d10f942b0502dddaf498e5afeb12c9fa50e700aa Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:52:33 +0300 Subject: [PATCH 10/71] [DOP-14546] - update Version to store raw str version (#248) * [DOP-14546] - update Version to store raw str version * [DOP-14546] - compare only Version instances * [DOP-14546] - add doctest to Version class * [DOP-14546] - split .digits logic into .min_digits and .format methods * [DOP-14546] - replace .min_digits to .format in _check_java_class_imported * Update onetl/connection/db_connection/kafka/connection.py Co-authored-by: Maxim Martynov * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [DOP-14546] - add formatting * Update onetl/file/format/excel.py Co-authored-by: Maxim Martynov * Update onetl/file/format/excel.py Co-authored-by: Maxim Martynov * [DOP-14546] - add formatting * [DOP-14546] - add spark/scala formatting tests * Update tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py 
Co-authored-by: Maxim Martynov * [DOP-14546] - add spark/scala formatting tests --------- Co-authored-by: Maxim Martynov Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/conf.py | 4 +- onetl/_util/hadoop.py | 2 +- onetl/_util/scala.py | 4 +- onetl/_util/spark.py | 4 +- onetl/_util/version.py | 264 ++++++++++-------- .../db_connection/greenplum/connection.py | 18 +- .../db_connection/jdbc_mixin/connection.py | 3 +- .../db_connection/kafka/connection.py | 12 +- .../db_connection/mongodb/connection.py | 12 +- .../db_connection/mssql/connection.py | 2 +- .../db_connection/oracle/connection.py | 2 +- .../file_df_connection/spark_s3/connection.py | 8 +- onetl/file/format/avro.py | 12 +- onetl/file/format/excel.py | 20 +- onetl/file/format/xml.py | 18 +- tests/fixtures/spark.py | 5 +- .../test_avro_integration.py | 5 +- .../test_csv_integration.py | 3 +- .../test_excel_integration.py | 7 +- .../test_xml_integration.py | 8 +- .../test_format_unit/test_avro_unit.py | 2 + .../test_format_unit/test_excel_unit.py | 2 + .../test_format_unit/test_xml_unit.py | 1 + .../test_greenplum_unit.py | 2 + .../test_kafka_unit.py | 1 + .../test_mongodb_unit.py | 2 + 26 files changed, 239 insertions(+), 184 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 188b1cdbd..dc4e425b7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,7 +16,7 @@ import sys from pathlib import Path -from packaging import version as Version +from packaging.version import Version PROJECT_ROOT_DIR = Path(__file__).parent.parent.resolve() @@ -34,7 +34,7 @@ # # The short X.Y version. -ver = Version.parse(subprocess.check_output("python ../setup.py --version", shell=True, text=True).strip()) +ver = Version(subprocess.check_output("python ../setup.py --version", shell=True, text=True).strip()) version = ver.base_version # The full version, including alpha/beta/rc tags. 
release = ver.public diff --git a/onetl/_util/hadoop.py b/onetl/_util/hadoop.py index 1faee4188..fdf275de7 100644 --- a/onetl/_util/hadoop.py +++ b/onetl/_util/hadoop.py @@ -17,7 +17,7 @@ def get_hadoop_version(spark_session: SparkSession) -> Version: jvm = spark_session._jvm # noqa: WPS437 version_info = jvm.org.apache.hadoop.util.VersionInfo # type: ignore[union-attr] hadoop_version: str = version_info.getVersion() - return Version.parse(hadoop_version) + return Version(hadoop_version) def get_hadoop_config(spark_session: SparkSession): diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index ec5d53fd8..397a91576 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -10,5 +10,5 @@ def get_default_scala_version(spark_version: Version) -> Version: Get default Scala version for specific Spark version """ if spark_version.major < 3: - return Version(2, 11) - return Version(2, 12) + return Version("2.11") + return Version("2.12") diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index 218c8d7de..230abe80e 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -63,14 +63,14 @@ def get_pyspark_version() -> Version: try_import_pyspark() import pyspark - return Version.parse(pyspark.__version__) + return Version(pyspark.__version__) def get_spark_version(spark_session: SparkSession) -> Version: """ Get Spark version from active Spark session """ - return Version.parse(spark_session.version) + return Version(spark_session.version) def get_executor_total_cores(spark_session: SparkSession, include_driver: bool = False) -> tuple[int | float, dict]: diff --git a/onetl/_util/version.py b/onetl/_util/version.py index 87583ee63..cb9bce7af 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -2,193 +2,231 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import re from functools import total_ordering -from string import ascii_lowercase -from typing import Iterator, NamedTuple, Sequence @total_ordering -class Version(NamedTuple): +class Version: """ Version representation. Examples -------- - >>> Version(5, 6, 7) - Version(major=5, minor=6, patch=7) - >>> Version(5, 6) - Version(major=5, minor=6, patch=None) - >>> Version(5) - Version(major=5, minor=None, patch=None) + >>> Version("12.3.1") + Version('12.3.1') + >>> Version("12.3") + Version('12.3') + >>> Version("12.3.4.5") + Version('12.3.4.5') + >>> Version("12.3.4-patch5") + Version('12.3.4-patch5') """ - major: int - minor: int | None = None - patch: int | None = None - - def __iter__(self) -> Iterator[int]: + def __init__(self, version: Version | str): + if isinstance(version, Version): + self._raw_str: str = version._raw_str + self._raw_parts: list[str] = version._raw_parts.copy() + self._numeric_parts: list[int] = version._numeric_parts.copy() + else: + self._raw_str = version + self._raw_parts = re.split("[.-]", version) + self._numeric_parts = [int(part) for part in self._raw_parts if part.isdigit()] + + @property + def major(self) -> int: """ - Iterate over version components which are not ``None``. + Return the major version component. Examples -------- - >>> for part in Version(5, 6, 7): - ... print(part) + >>> Version("5.6.7").major 5 - 6 - 7 + """ + return self._numeric_parts[0] if self._numeric_parts else 0 - >>> for part in Version(5, 6): - ... print(part) - 5 - 6 + @property + def minor(self) -> int: + """ + Return the minor version component. - >>> for part in Version(5): - ... 
print(part) - 5 + Examples + -------- + >>> Version("5.6.7").minor + 6 + >>> Version("5").minor + 0 """ - yield self.major - if self.minor is not None: - yield self.minor - if self.patch is not None: - yield self.patch + return self._numeric_parts[1] if len(self._numeric_parts) > 1 else 0 - def __len__(self): + @property + def patch(self) -> int: """ - Get number of components set. + Return the patch version component. Examples -------- - >>> assert len(Version(5, 6, 7)) == 3 - >>> assert len(Version(5, 6)) == 2 - >>> assert len(Version(5)) == 1 - + >>> Version("5.6.7").patch + 7 + >>> Version("5.6").patch + 0 """ - if self.patch is not None: - return 3 - if self.minor is not None: - return 2 - return 1 + return self._numeric_parts[2] if len(self._numeric_parts) > 2 else 0 - def __str__(self): + @property + def raw_parts(self) -> list[str]: """ - Get version as string. + Returns the parts of the version string as a list of substrings split by '.' or '-'. Examples -------- - - >>> assert str(Version(5, 6, 7)) == "5.6.7" - >>> assert str(Version(5, 6)) == "5.6" - >>> assert str(Version(5)) == "5" + >>> Version("1.2.3-alpha").raw_parts + ['1', '2', '3', 'alpha'] """ - return ".".join(map(str, self)) + return self._raw_parts - def __eq__(self, other): + def __getitem__(self, item): """ - Compare versions. + Allows direct access to the numeric parts of the version by index. Examples -------- + >>> Version("1.2.3")[0] + 1 + >>> Version("1.2.3")[1] + 2 + >>> Version("1.2.3")[2] + 3 + >>> Version("1.2.3-alpha")[3] + Traceback (most recent call last): + ... + IndexError: list index out of range + """ + return self._numeric_parts[item] - >>> assert Version(5, 6, 7) == Version(5, 6, 7) + def __len__(self): + """ + Get number of components set. - >>> # Version could be replaced with tuple[int, ...] - >>> assert Version(5, 6, 7) == (5, 6, 7) - >>> assert Version(5, 6) == (5, 6) - >>> assert Version(5) == (5,) + Examples + -------- + + >>> len(Version("5.6.7")) + 3 + >>> len(Version("5.6")) + 2 + >>> len(Version("5")) + 1 """ - if not isinstance(other, tuple): - return NotImplemented + return len(self._numeric_parts) - return tuple(self) == other + def __repr__(self): + return f"Version('{self._raw_str}')" - def __gt__(self, other): + def __str__(self): """ - Compare versions. + Return a string representation of the version. Examples -------- - >>> assert Version(5, 6, 7) > Version(5, 6, 6) - >>> assert not Version(5, 6, 7) > Version(5, 6, 7) - - >>> # Version could be replaced with tuple[int, ...] - >>> assert Version(5, 6, 7) > (5, 6) - >>> assert not Version(5, 6, 7) > (5, 7) + >>> str(Version("5.6.7")) + '5.6.7' + >>> str(Version("5.6")) + '5.6' + >>> str(Version("5.6.7.8")) + '5.6.7.8' + >>> str(Version("5.6.7-patch8")) + '5.6.7-patch8' - >>> assert Version(5, 6) > (5, 5) - >>> assert not Version(5, 6) > (5, 6) - >>> assert not Version(5, 6) > (5, 7) + """ + return self._raw_str - >>> assert Version(5, 6) > (5,) - >>> assert not Version(5, 6) > (6,) + def __eq__(self, other): + """ + Compare two versions for equality. 
- >>> assert Version(5) > (4,) - >>> assert not Version(5) > (5,) - >>> assert not Version(5) > (6,) + Examples + -------- + >>> Version("5.6.7") == Version("5.6.7") + True + >>> Version("5.6.7") == Version("5.6.8") + False """ - if not isinstance(other, tuple): + if not isinstance(other, Version): return NotImplemented + return self._numeric_parts == other._numeric_parts - return tuple(self) > other - - @classmethod - def parse(cls, version: int | float | str | Sequence) -> Version: + def __lt__(self, other: Version): """ - Parse input as version object. + Compare two versions using less than. Examples -------- - >>> assert Version.parse("5.6.7") == Version(5, 6, 7) - >>> assert Version.parse("5.6") == Version(5, 6) - >>> assert Version.parse("5") == Version(5) - - >>> assert Version.parse([5, 6, 7]) == Version(5, 6, 7) - >>> assert Version.parse([5, 6]) == Version(5, 6) - >>> assert Version.parse([5]) == Version(5) - - >>> assert Version.parse(5) == Version(5) - >>> assert Version.parse(5.0) == Version(5, 0) - + >>> Version("5.6.7") < Version("5.6.8") + True + >>> Version("5.6.9") < Version("5.6.8") + False """ - if isinstance(version, (int, float)): - version = str(version) - if isinstance(version, str): - version = version.split(".") - return cls(*map(int, version[:3])) + if not isinstance(other, Version): + return NotImplemented + return self._numeric_parts < other._numeric_parts - def digits(self, items: int) -> Version: + def min_digits(self, num_parts: int) -> Version: """ - Return version with exactly N components. + Ensure the version has at least a specified number of numeric components. Raises ------ - AssertionError + ValueError There is not enough components Examples -------- - - >>> assert Version(5, 6, 7).digits(3) == Version(5, 6, 7) - >>> assert Version(5, 6, 7).digits(2) == Version(5, 6) - >>> assert Version(5, 6, 7).digits(1) == Version(5) - >>> Version(5, 6).digits(3) - Traceback (most recent call last): - AssertionError: Version '5.6' does not match format 'a.b.c' - >>> Version(5).digits(2) + >>> Version("5.6.7").min_digits(3) + Version('5.6.7') + >>> Version("5.6.7").min_digits(2) + Version('5.6') + >>> Version("5.6").min_digits(3) Traceback (most recent call last): - AssertionError: Version '5' does not match format 'a.b' + ... + ValueError: Version '5.6' does not have enough numeric components for requested format. + """ + if len(self._numeric_parts) < num_parts: + raise ValueError(f"Version '{self}' does not have enough numeric components for requested format.") + truncated_parts = self._numeric_parts[:num_parts] + truncated_str = ".".join(str(part) for part in truncated_parts) + return Version(truncated_str) + + def format(self, format_string: str) -> str: + """ + Format the version using a custom format string. 
+ + Examples + -------- + >>> v = Version("5.6.7") + >>> v.format("{major}.{minor}.{patch}") + '5.6.7' + >>> v.format("{0}.{1}.{2}") + '5.6.7' + >>> v.format("{0}.{1}.{2} - Complete Version") + '5.6.7 - Complete Version' + >>> v = Version("12.3.4-patch5") + >>> v.format("{major}.{minor}.{patch}") + '12.3.4' """ - if len(self) < items: - expected = ".".join(ascii_lowercase[:items]) - raise AssertionError(f"Version '{self}' does not match format '{expected}'") - return self.parse(tuple(self)[:items]) + return format_string.format( + *self._numeric_parts, + major=self.major, + minor=self.minor, + patch=self.patch, + ) diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 4460be149..431fd6022 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -204,24 +204,24 @@ def get_packages( # Connector version is fixed, so we can perform checks for Scala/Spark version if package_version: - package_ver = Version.parse(package_version) + package_ver = Version(package_version) else: - package_ver = Version(2, 2, 0) + package_ver = Version("2.2.0") if scala_version: - scala_ver = Version.parse(scala_version) + scala_ver = Version(scala_version).min_digits(2) elif spark_version: - spark_ver = Version.parse(spark_version) - if spark_ver.digits(2) > (3, 2) or spark_ver.digits(2) < (2, 3): + spark_ver = Version(spark_version).min_digits(2) + if spark_ver > Version("3.2") or spark_ver < Version("2.3"): raise ValueError(f"Spark version must be 2.3.x - 3.2.x, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: raise ValueError("You should pass either `scala_version` or `spark_version`") - if scala_ver.digits(2) < (2, 11) or scala_ver.digits(2) > (2, 12): - raise ValueError(f"Scala version must be 2.11 - 2.12, got {scala_ver}") + if scala_ver < Version("2.11") or scala_ver > Version("2.12"): + raise ValueError(f"Scala version must be 2.11 - 2.12, got {scala_ver.format('{0}.{1}')}") - return [f"io.pivotal:greenplum-spark_{scala_ver.digits(2)}:{package_ver.digits(3)}"] + return [f"io.pivotal:greenplum-spark_{scala_ver.format('{0}.{1}')}:{package_ver}"] @classproperty def package_spark_2_3(cls) -> str: @@ -387,7 +387,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 76424c30c..856d387cf 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -17,6 +17,7 @@ from onetl._internal import clear_statement, stringify from onetl._util.java import get_java_gateway, try_import_java_class from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCOptions as JDBCMixinOptions, ) @@ -521,7 +522,7 @@ def _resultset_to_dataframe(self, result_set) -> DataFrame: java_converters = self.spark._jvm.scala.collection.JavaConverters # type: ignore - if get_spark_version(self.spark) >= (3, 4): + if get_spark_version(self.spark) >= Version("3.4"): # 
https://github.com/apache/spark/commit/2349175e1b81b0a61e1ed90c2d051c01cf78de9b result_schema = jdbc_utils.getSchema(result_set, jdbc_dialect, False, False) # noqa: WPS425 else: diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 21a042451..ddc6637f5 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -424,17 +424,17 @@ def get_packages( """ # Connector version is same as Spark, do not perform any additional checks - spark_ver = Version.parse(spark_version) - if spark_ver < (2, 4): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("2.4"): # Kafka connector for Spark 2.3 is build with Kafka client 0.10.0.1 which does not support # passing `sasl.jaas.config` option. It is supported only in 0.10.2.0, # see https://issues.apache.org/jira/browse/KAFKA-4259 # Old client requires generating JAAS file and placing it to filesystem, which is not secure. raise ValueError(f"Spark version must be at least 2.4, got {spark_ver}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) return [ - f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.digits(2)}:{spark_ver.digits(3)}", + f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}", ] def __enter__(self): @@ -592,7 +592,7 @@ def _validate_addresses(cls, value, values): @validator("spark") def _check_spark_version(cls, spark): spark_version = get_spark_version(spark) - if spark_version < (2, 4): + if spark_version < Version("2.4"): raise ValueError(f"Spark version must be at least 2.4, got {spark_version}") return spark @@ -604,7 +604,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index acaaa6539..51fc617fb 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -167,20 +167,20 @@ def get_packages( # Connector version is fixed, so we can perform checks for Scala/Spark version if scala_version: - scala_ver = Version.parse(scala_version) + scala_ver = Version(scala_version).min_digits(2) elif spark_version: - spark_ver = Version.parse(spark_version) + spark_ver = Version(spark_version) if spark_ver.major < 3: raise ValueError(f"Spark version must be at least 3.0, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: raise ValueError("You should pass either `scala_version` or `spark_version`") - if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13): - raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}") + if scala_ver < Version("2.12") or scala_ver > Version("2.13"): + raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver.format('{0}.{1}')}") # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector - return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"] + return 
[f"org.mongodb.spark:mongo-spark-connector_{scala_ver.format('{0}.{1}')}:10.1.1"] @classproperty def package_spark_3_2(cls) -> str: @@ -512,7 +512,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 608ef7521..1756146fa 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -173,7 +173,7 @@ def get_packages( if java_version is None: java_version = "8" - java_ver = Version.parse(java_version) + java_ver = Version(java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index f6b1ad2d9..f239b85b2 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -203,7 +203,7 @@ def get_packages( if java_version is None: java_version = "8" - java_ver = Version.parse(java_version) + java_ver = Version(java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 46ce8bb33..44b1e2355 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -260,14 +260,14 @@ def get_packages( """ - spark_ver = Version.parse(spark_version) + spark_ver = Version(spark_version).min_digits(3) if spark_ver.major < 3: # https://issues.apache.org/jira/browse/SPARK-23977 raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud - return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] + return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] @slot def path_from_string(self, path: os.PathLike | str) -> RemotePath: @@ -365,7 +365,7 @@ def _check_java_class_imported(cls, spark: SparkSession) -> SparkSession: try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(3) + spark_version = get_spark_version(spark).format("{major}.{minor}.{patch}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 551c76c9b..94fad71dd 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -148,15 +148,15 @@ def get_packages( """ - spark_ver = Version.parse(spark_version) - if spark_ver < (2, 4): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("2.4"): raise ValueError(f"Spark version should be at least 2.4, got {spark_version}") - scala_ver = Version.parse(scala_version) if scala_version else 
get_default_scala_version(spark_ver) - if scala_ver.digits(2) < (2, 11): - raise ValueError(f"Scala version should be at least 2.11, got {scala_ver}") + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) + if scala_ver < Version("2.11"): + raise ValueError(f"Scala version should be at least 2.11, got {scala_ver.format('{0}.{1}')}") - return [f"org.apache.spark:spark-avro_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] + return [f"org.apache.spark:spark-avro_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] @slot def check_if_supported(self, spark: SparkSession) -> None: diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index a7342f702..ffbaf9173 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -167,28 +167,30 @@ def get_packages( """ if package_version: - version = Version.parse(package_version) - if version < (0, 15): + version = Version(package_version) + if version < Version("0.15"): # format="com.crealytics.spark.excel" does not support reading folder with files # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions. # So using 0.15 as the lowest supported version. raise ValueError(f"Package version should be at least 0.15, got {package_version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version(0, 20, 3) + version = Version("0.20.3") - spark_ver = Version.parse(spark_version) - if spark_ver < (3, 2): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("3.2"): # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12 # when default pyspark==2.4.1 is built with Scala 2.11. 
# See https://github.com/crealytics/spark-excel/issues/426 raise ValueError(f"Spark version should be at least 3.2, got {spark_version}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) - if scala_ver.digits(2) < (2, 12): - raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}") + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version should be at least 2.12, got {scala_ver.format('{0}.{1}')}") - return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"] + return [ + f"com.crealytics:spark-excel_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}_{version}", + ] @slot def check_if_supported(self, spark: SparkSession) -> None: diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 47190ea77..2c3a92cdf 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -191,24 +191,24 @@ def get_packages( # noqa: WPS231 """ if package_version: - version = Version.parse(package_version) - if version < (0, 14): + version = Version(package_version).min_digits(3) + if version < Version("0.14"): raise ValueError(f"Package version must be above 0.13, got {version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version(0, 17, 0) + version = Version("0.17.0").min_digits(3) - spark_ver = Version.parse(spark_version) - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + spark_ver = Version(spark_version) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # Ensure compatibility with Spark and Scala versions - if spark_ver < (3, 0): + if spark_ver < Version("3.0"): raise ValueError(f"Spark version must be 3.x, got {spark_ver}") - if scala_ver < (2, 12) or scala_ver > (2, 13): - raise ValueError(f"Scala version must be 2.12 or 2.13, got {scala_ver}") + if scala_ver < Version("2.12") or scala_ver > Version("2.13"): + raise ValueError(f"Scala version must be 2.12 or 2.13, got {scala_ver.format('{0}.{1}')}") - return [f"com.databricks:spark-xml_{scala_ver.digits(2)}:{version.digits(3)}"] + return [f"com.databricks:spark-xml_{scala_ver.format('{0}.{1}')}:{version}"] @slot def check_if_supported(self, spark: SparkSession) -> None: diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index b2559d520..241f87fc3 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -5,6 +5,7 @@ import pytest from onetl._util.spark import get_pyspark_version +from onetl._util.version import Version @pytest.fixture(scope="session") @@ -66,13 +67,13 @@ def maven_packages(): ), ) - if pyspark_version >= (2, 4): + if pyspark_version >= Version("2.4"): # There is no Avro package for Spark 2.3 packages.extend(Avro.get_packages(spark_version=pyspark_version)) # Kafka connector for Spark 2.3 is too old and not supported packages.extend(Kafka.get_packages(spark_version=pyspark_version)) - if pyspark_version >= (3, 2): + if pyspark_version >= Version("3.2"): # There is no SparkS3 connector for Spark less than 3 packages.extend(SparkS3.get_packages(spark_version=pyspark_version)) diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index 
eaffd6499..d9ada7bda 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -7,6 +7,7 @@ import pytest from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import Avro @@ -54,7 +55,7 @@ def test_avro_reader( ): """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (2, 4): + if spark_version < Version("2.4"): pytest.skip("Avro files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -91,7 +92,7 @@ def test_avro_writer( ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) - if spark_version < (2, 4): + if spark_version < Version("2.4"): pytest.skip("Avro files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index 5dbfd20e1..56cb8f052 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -7,6 +7,7 @@ import pytest from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import CSV @@ -46,7 +47,7 @@ def test_csv_reader_with_infer_schema( if spark_version.major < 3: # Spark 2 infers "date_value" as timestamp instead of date expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) - elif spark_version < (3, 3): + elif spark_version < Version("3.3"): # Spark 3.2 cannot infer "date_value", and return it as string expected_df = df.withColumn("date_value", col("date_value").cast("string")) diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py index 9228abd3d..8344de1db 100644 --- a/tests/tests_integration/test_file_format_integration/test_excel_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -7,6 +7,7 @@ import pytest from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import Excel @@ -30,7 +31,7 @@ def test_excel_reader_with_infer_schema( ): """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -79,7 +80,7 @@ def test_excel_reader_with_options( ): """Reading Excel files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -115,7 +116,7 @@ def test_excel_writer( ): """Written files can be read 
by Spark""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path diff --git a/tests/tests_integration/test_file_format_integration/test_xml_integration.py b/tests/tests_integration/test_file_format_integration/test_xml_integration.py index 2be9d33a4..81aade061 100644 --- a/tests/tests_integration/test_file_format_integration/test_xml_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_xml_integration.py @@ -43,7 +43,7 @@ def test_xml_reader( ): """Reading XML files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -70,7 +70,7 @@ def test_xml_reader_with_infer_schema( ): """Reading XML files with inferSchema=True working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -108,7 +108,7 @@ def test_xml_writer( ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") file_df_connection, source_path = local_fs_file_df_connection_with_path @@ -150,7 +150,7 @@ def test_xml_reader_with_attributes( ): """Reading XML files with attributes works as expected""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py index 97cb42dc8..081ed478f 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py @@ -33,6 +33,8 @@ def test_avro_get_packages_scala_version_not_supported(): ("2.4.0", "2.12", "org.apache.spark:spark-avro_2.12:2.4.0"), ("3.5.0", "2.12", "org.apache.spark:spark-avro_2.12:3.5.0"), ("3.5.0", "2.13", "org.apache.spark:spark-avro_2.13:3.5.0"), + # Scala version contain three digits when only two needed + ("3.5.0", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.0"), ], ) def test_avro_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py index 9424e8b1c..c99b375fd 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -42,6 +42,8 @@ def test_excel_get_packages_package_version_not_supported(): # Override package version ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), ("3.5.0", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.0_0.18.0"]), + # Scala version contain three digits when only two needed + ("3.5.0", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]), ], ) def 
test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): diff --git a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py index 17d9dac2a..bd98c0ff5 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py @@ -20,6 +20,7 @@ ("3.4.1", "2.13", "0.18.0", ["com.databricks:spark-xml_2.13:0.18.0"]), ("3.3.0", None, "0.16.0", ["com.databricks:spark-xml_2.12:0.16.0"]), ("3.3.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.2.4", "2.12.1", "0.15.0", ["com.databricks:spark-xml_2.12:0.15.0"]), ], ) def test_xml_get_packages(spark_version, scala_version, package_version, expected_packages): diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index bda06d282..c1cb13804 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -65,6 +65,8 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): # Override Scala version detected automatically ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), + # Scala version contain three digits when only two needed + ("3.2.4", "2.12.1", "io.pivotal:greenplum-spark_2.12:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py index a97212560..2e0ccd1a0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py @@ -42,6 +42,7 @@ def create_temp_file(tmp_path_factory): ("3.3.0", None, "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0"), ("3.3.0", "2.12", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0"), ("3.3.0", "2.13", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.3.0"), + ("3.3.1", "2.12.2", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1"), ], ) def test_kafka_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 48882f185..c7e9e42b7 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -61,6 +61,8 @@ def test_mongodb_get_packages_scala_version_not_supported(scala_version): # Override Scala version detected automatically ("3.2", "2.12", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), ("3.4", "2.13", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), + # Scala version contain three digits when only two needed + ("3.2.4", "2.12.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), ], ) def test_mongodb_get_packages(spark_version, scala_version, package): From 702400e82abcba3a79d06ad0aaf9cd9751c941fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 15 Apr 2024 13:05:40 +0000 Subject: [PATCH 11/71] Add pysmb to onetl[files] extra --- docs/changelog/next_release/+.bugfix.rst | 1 + setup.py | 9 ++++++++- 2 files changed, 9 
insertions(+), 1 deletion(-) create mode 100644 docs/changelog/next_release/+.bugfix.rst diff --git a/docs/changelog/next_release/+.bugfix.rst b/docs/changelog/next_release/+.bugfix.rst new file mode 100644 index 000000000..0b9a0db76 --- /dev/null +++ b/docs/changelog/next_release/+.bugfix.rst @@ -0,0 +1 @@ +Fix missing ``pysmb`` package after installing ``pip install onetl[files]`` . diff --git a/setup.py b/setup.py index d428bd826..c7ce5d0dc 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,14 @@ def parse_requirements(file: Path) -> list[str]: requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt") requirements_s3 = parse_requirements(here / "requirements" / "s3.txt") requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt") -requirements_files = [*requirements_ftp, *requirements_sftp, *requirements_hdfs, *requirements_s3, *requirements_webdav] +requirements_files = [ + *requirements_ftp, + *requirements_sftp, + *requirements_hdfs, + *requirements_s3, + *requirements_webdav, + *requirements_samba, +] requirements_kerberos = parse_requirements(here / "requirements" / "kerberos.txt") requirements_spark = parse_requirements(here / "requirements" / "spark.txt") From 5b370bc24a1f7439b9da195b78426954b84bbaa1 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 15 Apr 2024 16:59:44 +0300 Subject: [PATCH 12/71] [DOP-13855] - update clickhouse driver (#249) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [DOP-13855] - update clickhouse driver * [DOP-13855] - update clickhouse get_packages() tests * [DOP-13855] Fix Clickhouse HWM tests * [DOP-13855] - update clickhouse get_packages() tests * [DOP-13855] Fix Clickhouse HWM tests * [DOP-13855] Fix Clickhouse HWM tests * [DOP-13855] - update clickhouse get_packages() tests * [DOP-13855] - fix Version.min_digits() method * [DOP-13855] - update numeric mapping * [DOP-13855] - update temporal mapping --------- Co-authored-by: Мартынов Максим Сергеевич --- docs/changelog/next_release/249.breaking.rst | 1 + docs/changelog/next_release/249.feature.rst | 1 + .../db_connection/clickhouse/types.rst | 57 ++++++------- onetl/VERSION | 2 +- onetl/_util/version.py | 12 +-- .../db_connection/clickhouse/connection.py | 51 +++++++++--- .../db_connection/greenplum/connection.py | 2 +- tests/fixtures/processing/clickhouse.py | 16 ++++ .../test_clickhouse_integration.py | 4 +- .../test_clickhouse_unit.py | 80 ++++++++++++++++--- .../test_greenplum_unit.py | 2 +- tests/util/assert_df.py | 2 +- 12 files changed, 166 insertions(+), 64 deletions(-) create mode 100644 docs/changelog/next_release/249.breaking.rst create mode 100644 docs/changelog/next_release/249.feature.rst diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst new file mode 100644 index 000000000..04764a1e6 --- /dev/null +++ b/docs/changelog/next_release/249.breaking.rst @@ -0,0 +1 @@ +Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0 `_. diff --git a/docs/changelog/next_release/249.feature.rst b/docs/changelog/next_release/249.feature.rst new file mode 100644 index 000000000..7d79a6c87 --- /dev/null +++ b/docs/changelog/next_release/249.feature.rst @@ -0,0 +1 @@ +Allow passing custom JDBC driver version to ``Clickhouse.get_packages(package_version=...)``. 
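A minimal usage sketch for the ``package_version`` argument described in the changelog entry above (assuming a locally created Spark session; the app name and the pinned versions are illustrative, not part of this changeset): .. code:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Clickhouse

    # Maven coordinates for the JDBC driver (and, for 0.5.0+, the Apache HTTP client)
    packages = Clickhouse.get_packages(package_version="0.6.0")

    # Spark resolves and downloads the listed packages on session start
    spark = (
        SparkSession.builder.appName("onetl-clickhouse")  # hypothetical app name
        .config("spark.jars.packages", ",".join(packages))
        .getOrCreate()
    )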
diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 1df369dd4..b0171b503 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -125,11 +125,9 @@ Numeric types ~~~~~~~~~~~~~ +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +================================+===================================+===============================+===============================+ -| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +| ``Bool`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -147,11 +145,9 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal256(S=0..76)`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | -+--------------------------------+ | | | -| ``Float64`` | | | | +| ``Float32`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +| ``Float64`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | +--------------------------------+ | | | @@ -161,7 +157,7 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | +| ``Int128`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int256`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -170,16 +166,16 @@ Numeric types | ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | 
-+--------------------------------+ | | | -| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt16`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | +--------------------------------+ | | | | ``UInt64`` | | | | -+--------------------------------+ | | | -| ``UInt128`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``UInt256`` | unsupported [3]_ | | | +| ``UInt128`` | unsupported [3]_ | | | ++--------------------------------+ | | | +| ``UInt256`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ .. [3] @@ -198,33 +194,32 @@ Notes: * ``TIMESTAMP`` is alias for ``DateTime32``, but ``TIMESTAMP(N)`` is alias for ``DateTime64(N)`` +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +===================================+======================================+==================================+===============================+ | ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``Date32`` | unsupported | | | +| ``Date32`` | ``DateType()`` | ``Date`` | ``Date`` | +| | | | **cannot be inserted** [6]_ | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -| | | | **precision loss** [5]_ | -| | | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -+-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [6]_ | -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | -| | **precision loss** [4]_ | **precision loss** [4]_ | | -| | | | | +| ``DateTime32``, seconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | ``DateTime32`` | ++-----------------------------------+--------------------------------------+----------------------------------+ seconds | +| ``DateTime64(3)``, milliseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | **precision loss** [4]_ | 
++-----------------------------------+--------------------------------------+----------------------------------+ | +| ``DateTime64(6)``, microseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | | +-----------------------------------+--------------------------------------+----------------------------------+ | -| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()`` | ``DateTime64(6)`` | | +| | | microseconds | | +| | | **precision loss** [4]_ | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalNanosecond`` | unsupported | | | +| ``-`` | ``TimestampNTZType()`` | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | ``LongType()`` | ```Int64`` | ``Int64`` | +-----------------------------------+ | | | | ``IntervalMicrosecond`` | | | | +-----------------------------------+ | | | | ``IntervalMillisecond`` | | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalSecond`` | | | | +-----------------------------------+ | | | | ``IntervalMinute`` | | | | +-----------------------------------+ | | | diff --git a/onetl/VERSION b/onetl/VERSION index a3f5a8ed4..d9df1bbc0 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.3 +0.11.0 diff --git a/onetl/_util/version.py b/onetl/_util/version.py index cb9bce7af..24b924da2 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -195,17 +195,17 @@ def min_digits(self, num_parts: int) -> Version: >>> Version("5.6.7").min_digits(3) Version('5.6.7') >>> Version("5.6.7").min_digits(2) - Version('5.6') + Version('5.6.7') >>> Version("5.6").min_digits(3) Traceback (most recent call last): ... - ValueError: Version '5.6' does not have enough numeric components for requested format. + ValueError: Version '5.6' does not have enough numeric components for requested format (expected at least 3). 
""" if len(self._numeric_parts) < num_parts: - raise ValueError(f"Version '{self}' does not have enough numeric components for requested format.") - truncated_parts = self._numeric_parts[:num_parts] - truncated_str = ".".join(str(part) for part in truncated_parts) - return Version(truncated_str) + raise ValueError( + f"Version '{self}' does not have enough numeric components for requested format (expected at least {num_parts}).", + ) + return self def format(self, format_string: str) -> str: """ diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 88de5645a..fa06d4daa 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -3,10 +3,10 @@ from __future__ import annotations import logging -import warnings from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.clickhouse.dialect import ClickhouseDialect from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_mixin import JDBCStatementType @@ -28,7 +28,7 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0 `_ (`official Clickhouse JDBC driver `_). .. warning:: @@ -104,14 +104,26 @@ class Clickhouse(JDBCConnection): Extra = ClickhouseExtra Dialect = ClickhouseDialect - DRIVER: ClassVar[str] = "ru.yandex.clickhouse.ClickHouseDriver" + DRIVER: ClassVar[str] = "com.clickhouse.jdbc.ClickHouseDriver" @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages( + cls, + package_version: str | None = None, + apache_http_client_version: str | None = None, + ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| + Parameters + ---------- + package_version : str , optional + ClickHouse JDBC version client packages. Defaults to ``0.6.0``. + + apache_http_client_version : str, optional + Apache HTTP Client version package. Defaults to ``5.3.1``. + Examples -------- @@ -119,17 +131,34 @@ def get_packages(cls) -> list[str]: from onetl.connection import Clickhouse - Clickhouse.get_packages() + Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") + + .. note:: + + Spark does not support ``.jar`` classifiers, so it is not possible to pass + ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. 
""" - return ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] + package_version_obj = Version(package_version).min_digits(3) if package_version else Version("0.6.0") + apache_http_client_version_obj = ( + Version(apache_http_client_version).min_digits(3) if apache_http_client_version else Version("5.3.1") + ) + + result = [ + f"com.clickhouse:clickhouse-jdbc:{package_version_obj}", + f"com.clickhouse:clickhouse-http-client:{package_version_obj}", + ] + + if package_version_obj >= Version("0.5.0"): + # before 0.5.0 builtin Java HTTP Client was used + result.append(f"org.apache.httpcomponents.client5:httpclient5:{apache_http_client_version_obj}") + + return result @classproperty - def package(cls) -> str: - """Get package name to be downloaded by Spark.""" - msg = "`Clickhouse.package` will be removed in 1.0.0, use `Clickhouse.get_packages()` instead" - warnings.warn(msg, UserWarning, stacklevel=3) - return "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + def package(self) -> str: + """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" + return "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 431fd6022..6744816f8 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -212,7 +212,7 @@ def get_packages( scala_ver = Version(scala_version).min_digits(2) elif spark_version: spark_ver = Version(spark_version).min_digits(2) - if spark_ver > Version("3.2") or spark_ver < Version("2.3"): + if spark_ver >= Version("3.3") or spark_ver < Version("2.3"): raise ValueError(f"Spark version must be 2.3.x - 3.2.x, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: diff --git a/tests/fixtures/processing/clickhouse.py b/tests/fixtures/processing/clickhouse.py index 1205fad6a..2b3e4cec1 100644 --- a/tests/fixtures/processing/clickhouse.py +++ b/tests/fixtures/processing/clickhouse.py @@ -152,3 +152,19 @@ def get_expected_dataframe( order_by: str | None = None, ) -> pandas.DataFrame: return self.connection.query_dataframe(self.get_expected_dataframe_ddl(schema, table, order_by)) + + def fix_pandas_df( + self, + df: pandas.DataFrame, + ) -> pandas.DataFrame: + df = super().fix_pandas_df(df) + + for column in df.columns: + column_name = column.lower() + + if "float" in column_name: + # somethere in chain Clickhouse -> Spark -> Pandas Float32 column is being converted to Float64, + # causing tests to fail. 
Convert it back to original type + df[column] = df[column].astype("float32") + + return df diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 73d8abfb2..c786b1fe0 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -227,9 +227,7 @@ def table_finalizer(): updated_df = pandas.concat([updated_rows, unchanged_rows]) processing.assert_equal_df(df=df, other_frame=updated_df, order_by="id_int") - # not supported by Clickhouse - with pytest.raises(Exception): - clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") + clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") clickhouse.execute(f"ALTER TABLE {temp_table} DELETE WHERE id_int < 70{suffix}") df = clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}") diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 42b5582ae..15388ee7e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -1,5 +1,3 @@ -import re - import pytest from onetl.connection import Clickhouse @@ -8,21 +6,85 @@ def test_clickhouse_driver(): - assert Clickhouse.DRIVER == "ru.yandex.clickhouse.ClickHouseDriver" + assert Clickhouse.DRIVER == "com.clickhouse.jdbc.ClickHouseDriver" def test_clickhouse_package(): - warning_msg = re.escape("will be removed in 1.0.0, use `Clickhouse.get_packages()` instead") - with pytest.warns(UserWarning, match=warning_msg): - assert Clickhouse.package == "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + expected_packages = "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" + assert Clickhouse.package == expected_packages + + +@pytest.mark.parametrize( + "package_version, apache_http_client_version, expected_packages", + [ + ( + None, + None, + [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ( + "0.6.0-patch3", + "5.3.1", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0-patch3", + "com.clickhouse:clickhouse-http-client:0.6.0-patch3", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ( + "0.4.0", + "4.5.14", + ["com.clickhouse:clickhouse-jdbc:0.4.0", "com.clickhouse:clickhouse-http-client:0.4.0"], + ), # No HTTP client should be included + ( + "0.5.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.5.0", + "com.clickhouse:clickhouse-http-client:0.5.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), + ( + "0.6.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), + ], +) +def test_clickhouse_get_packages(package_version, apache_http_client_version, expected_packages): + assert ( + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) + == expected_packages + ) -def test_clickhouse_get_packages(): - assert Clickhouse.get_packages() == ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] 
+@pytest.mark.parametrize( + "package_version, apache_http_client_version", + [ + ("0.7", "5.3.1"), + ("1", "5.4.0"), + ("a.b.c", "5.3.1"), + ], +) +def test_invalid_versions_raise_error(package_version, apache_http_client_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) def test_clickhouse_missing_package(spark_no_packages): - msg = "Cannot import Java class 'ru.yandex.clickhouse.ClickHouseDriver'" + msg = "Cannot import Java class 'com.clickhouse.jdbc.ClickHouseDriver'" with pytest.raises(ValueError, match=msg): Clickhouse( host="some_host", diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index c1cb13804..de24e5ce2 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -66,7 +66,7 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), # Scala version contain three digits when only two needed - ("3.2.4", "2.12.1", "io.pivotal:greenplum-spark_2.12:2.2.0"), + ("3.2.4", "2.11.1", "io.pivotal:greenplum-spark_2.11:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): diff --git a/tests/util/assert_df.py b/tests/util/assert_df.py index f7adad032..e36c9af73 100644 --- a/tests/util/assert_df.py +++ b/tests/util/assert_df.py @@ -67,4 +67,4 @@ def assert_subset_df( for column in columns: # noqa: WPS528 difference = ~small_pdf[column].isin(large_pdf[column]) - assert not difference.all(), large_pdf[difference] + assert not difference.all(), small_pdf[difference] From 78bad0a6bfa24ca81401eea1993edc16c8834f94 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Apr 2024 21:15:01 +0000 Subject: [PATCH 13/71] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.3.0 → 24.4.0](https://github.com/psf/black/compare/24.3.0...24.4.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d2c6ff79..a5aa7cb60 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -96,7 +96,7 @@ repos: args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.0 hooks: - id: black language_version: python3 From bd2f8245de7ae97124d787fea8561b4fc0f9c696 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:44:59 +0300 Subject: [PATCH 14/71] [DOP-13849] - upgrade Postgres packages (#251) * [DOP-13849] - upgrade postgres packages * [DOP-13849] - put away Version annotations --- docs/changelog/next_release/249.feature.rst | 2 +- docs/changelog/next_release/251.feature.rst | 1 + .../db_connection/clickhouse/connection.py | 22 +++++----- .../db_connection/postgres/connection.py | 22 +++++++--- .../test_clickhouse_unit.py | 2 +- .../test_postgres_unit.py | 40 +++++++++++++++---- 6 files changed, 63 insertions(+), 26 deletions(-) create mode 
100644 docs/changelog/next_release/251.feature.rst diff --git a/docs/changelog/next_release/249.feature.rst b/docs/changelog/next_release/249.feature.rst index 7d79a6c87..8ec0686c2 100644 --- a/docs/changelog/next_release/249.feature.rst +++ b/docs/changelog/next_release/249.feature.rst @@ -1 +1 @@ -Allow passing custom JDBC driver version to ``Clickhouse.get_packages(package_version=...)``. +:class:`Clickhouse` connection supports passing custom versions: ``Clickhouse.get_packages(package_version=...)``. diff --git a/docs/changelog/next_release/251.feature.rst b/docs/changelog/next_release/251.feature.rst new file mode 100644 index 000000000..bc8d528a2 --- /dev/null +++ b/docs/changelog/next_release/251.feature.rst @@ -0,0 +1 @@ +:class:`Postgres` connection now uses PostgreSQL JDBC driver ``42.7.3``, upgraded from ``42.6.0``, and supports passing custom versions: ``Postgres.get_packages(package_version=...)``. diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index fa06d4daa..288612f3c 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -114,11 +114,11 @@ def get_packages( apache_http_client_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC and Apache HTTP Client versions. |support_hooks| Parameters ---------- - package_version : str , optional + package_version : str, optional ClickHouse JDBC version client packages. Defaults to ``0.6.0``. apache_http_client_version : str, optional @@ -139,19 +139,19 @@ def get_packages( ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. 
""" - package_version_obj = Version(package_version).min_digits(3) if package_version else Version("0.6.0") - apache_http_client_version_obj = ( - Version(apache_http_client_version).min_digits(3) if apache_http_client_version else Version("5.3.1") - ) + default_jdbc_version = "0.6.0" + default_http_version = "5.3.1" + + jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) + http_version = Version(apache_http_client_version or default_http_version).min_digits(3) result = [ - f"com.clickhouse:clickhouse-jdbc:{package_version_obj}", - f"com.clickhouse:clickhouse-http-client:{package_version_obj}", + f"com.clickhouse:clickhouse-jdbc:{jdbc_version}", + f"com.clickhouse:clickhouse-http-client:{jdbc_version}", ] - if package_version_obj >= Version("0.5.0"): - # before 0.5.0 builtin Java HTTP Client was used - result.append(f"org.apache.httpcomponents.client5:httpclient5:{apache_http_client_version_obj}") + if jdbc_version >= Version("0.5.0"): + result.append(f"org.apache.httpcomponents.client5:httpclient5:{http_version}") return result diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index a4d27bcea..80cddbc11 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -6,6 +6,7 @@ from typing import ClassVar from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions from onetl.connection.db_connection.postgres.dialect import PostgresDialect @@ -27,7 +28,7 @@ class Config: class Postgres(JDBCConnection): """PostgreSQL JDBC connection. |support_hooks| - Based on Maven package ``org.postgresql:postgresql:42.6.0`` + Based on Maven package ``org.postgresql:postgresql:42.7.3`` (`official Postgres JDBC driver `_). .. warning:: @@ -105,9 +106,14 @@ class Postgres(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages(cls, package_version: str | None = None) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version. |support_hooks| + + Parameters + ---------- + package_version : str, optional + Specifies the version of the PostgreSQL JDBC driver to use. Defaults to ``42.7.3``. 
Examples -------- @@ -118,15 +124,21 @@ def get_packages(cls) -> list[str]: Postgres.get_packages() + # custom package version + Postgres.get_packages(package_version="42.6.0") + """ - return ["org.postgresql:postgresql:42.6.0"] + default_version = "42.7.3" + version = Version(package_version or default_version).min_digits(3) + + return [f"org.postgresql:postgresql:{version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Postgres.package` will be removed in 1.0.0, use `Postgres.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "org.postgresql:postgresql:42.6.0" + return "org.postgresql:postgresql:42.7.3" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 15388ee7e..6b400b93e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -75,7 +75,7 @@ def test_clickhouse_get_packages(package_version, apache_http_client_version, ex ("a.b.c", "5.3.1"), ], ) -def test_invalid_versions_raise_error(package_version, apache_http_client_version): +def test_clickhouse_get_packages_invalid_version(package_version, apache_http_client_version): with pytest.raises( ValueError, match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index f6b5f4e9b..f4c00f30f 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -14,14 +14,38 @@ def test_postgres_class_attributes(): def test_postgres_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Postgres.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Postgres.package == "org.postgresql:postgresql:42.6.0" - - -def test_postgres_get_packages(): - assert Postgres.get_packages() == ["org.postgresql:postgresql:42.6.0"] - - -def test_oracle_missing_package(spark_no_packages): + assert Postgres.package == "org.postgresql:postgresql:42.7.3" + + +@pytest.mark.parametrize( + "package_version, expected_packages", + [ + (None, ["org.postgresql:postgresql:42.7.3"]), + ("42.7.3", ["org.postgresql:postgresql:42.7.3"]), + ("42.7.3-patch", ["org.postgresql:postgresql:42.7.3-patch"]), + ("42.6.0", ["org.postgresql:postgresql:42.6.0"]), + ], +) +def test_postgres_get_packages(package_version, expected_packages): + assert Postgres.get_packages(package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "package_version", + [ + "42.2", + "abc", + ], +) +def test_postgres_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + Postgres.get_packages(package_version=package_version) + + +def test_postgres_missing_package(spark_no_packages): msg = "Cannot import Java class 'org.postgresql.Driver'" with pytest.raises(ValueError, match=msg): Postgres( From 5819e213f14f687964ab73396ae8c1d07e6f87af Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 17 Apr 2024 11:52:48 +0300 Subject: [PATCH 
15/71] [DOP-13850] - upgrade Oracle packages (#252) * [DOP-13850] - upgrade Oracle packages * [DOP-13850] - add min_digits version check to oracle package --- docs/changelog/next_release/252.feature.rst | 1 + .../db_connection/oracle/connection.py | 29 +++++++----- .../test_oracle_unit.py | 47 +++++++++++++++---- 3 files changed, 56 insertions(+), 21 deletions(-) create mode 100644 docs/changelog/next_release/252.feature.rst diff --git a/docs/changelog/next_release/252.feature.rst b/docs/changelog/next_release/252.feature.rst new file mode 100644 index 000000000..497b3c4ae --- /dev/null +++ b/docs/changelog/next_release/252.feature.rst @@ -0,0 +1 @@ +:class:`Oracle` connection now uses Oracle JDBC driver ``23.3.0.23.09``, upgraded from ``23.2.0.0``, and supports passing custom versions: ``Oracle.get_packages(java_version=..., package_version=...)``. diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index f239b85b2..a2d8d35b9 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -179,15 +179,18 @@ class Oracle(JDBCConnection): @classmethod def get_packages( cls, - java_version: str | Version | None = None, + java_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for Oracle. |support_hooks| Parameters ---------- - java_version : str, default ``8`` - Java major version. + java_version : str, optional + Java major version, defaults to ``8``. Must be at least ``8``. + package_version : str, optional + Specifies the version of the Oracle JDBC driver to use. Defaults to ``23.3.0.23.09``. 
Examples -------- @@ -197,25 +200,29 @@ def get_packages( from onetl.connection import Oracle Oracle.get_packages() - Oracle.get_packages(java_version="8") + # specify Java and package versions + Oracle.get_packages(java_version="8", package_version="23.2.0.0") """ - if java_version is None: - java_version = "8" - java_ver = Version(java_version) + default_java_version = "8" + default_package_version = "23.3.0.23.09" + + java_ver = Version(java_version or default_java_version) if java_ver.major < 8: - raise ValueError(f"Java version must be at least 8, got {java_ver}") + raise ValueError(f"Java version must be at least 8, got {java_ver.major}") jre_ver = "8" if java_ver.major < 11 else "11" - return [f"com.oracle.database.jdbc:ojdbc{jre_ver}:23.2.0.0"] + jdbc_version = Version(package_version or default_package_version).min_digits(4) + + return [f"com.oracle.database.jdbc:ojdbc{jre_ver}:{jdbc_version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Oracle.package` will be removed in 1.0.0, use `Oracle.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.oracle.database.jdbc:ojdbc8:23.2.0.0" + return "com.oracle.database.jdbc:ojdbc8:23.3.0.23.09" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index 6a875b8f7..cb2b9dc7b 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -14,11 +14,11 @@ def test_oracle_class_attributes(): def test_oracle_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Oracle.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.2.0.0" + assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.3.0.23.09" def test_oracle_get_packages_no_input(): - assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.2.0.0"] + assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"] @pytest.mark.parametrize("java_version", ["7", "6"]) @@ -28,17 +28,44 @@ def test_oracle_get_packages_java_version_not_supported(java_version): @pytest.mark.parametrize( - "java_version, package", + "java_version, package_version, expected_packages", [ - ("8", "com.oracle.database.jdbc:ojdbc8:23.2.0.0"), - ("9", "com.oracle.database.jdbc:ojdbc8:23.2.0.0"), - ("11", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), - ("17", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), - ("20", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), + (None, None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + ("8", "23.3.0.23.09", ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + ("8", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), + ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + ("9", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), + ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"]), + ("11", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), + ("17", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), + ("20", "23.3.0.23.09", ["com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"]), ], ) -def test_oracle_get_packages(java_version, package): - assert Oracle.get_packages(java_version=java_version) == [package] +def test_oracle_get_packages(java_version, 
package_version, expected_packages): + assert Oracle.get_packages(java_version=java_version, package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "java_version, package_version", + [ + ("8", "23.3.0"), + ("11", "23.3"), + ("11", "a.b.c.d"), + ], +) +def test_oracle_get_packages_invalid_version(java_version, package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 4\).", + ): + Oracle.get_packages(java_version=java_version, package_version=package_version) + + +@pytest.mark.parametrize("java_version", ["7", "6"]) +def test_oracle_get_packages_java_version_not_supported(java_version): + with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): + Oracle.get_packages(java_version=java_version) def test_oracle_missing_package(spark_no_packages): From 2145be9f1e3dad500f2be777c81d6d2cabc950ad Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:08:55 +0300 Subject: [PATCH 16/71] [DOP-13851] - upgrade MySql packages (#253) --- docs/changelog/next_release/253.feature.rst | 1 + .../db_connection/mysql/connection.py | 20 ++++++++--- .../test_mysql_unit.py | 34 ++++++++++++++++--- 3 files changed, 45 insertions(+), 10 deletions(-) create mode 100644 docs/changelog/next_release/253.feature.rst diff --git a/docs/changelog/next_release/253.feature.rst b/docs/changelog/next_release/253.feature.rst new file mode 100644 index 000000000..46b364d95 --- /dev/null +++ b/docs/changelog/next_release/253.feature.rst @@ -0,0 +1 @@ +:class:`MySQL` connection now uses MySQL JDBC driver ``8.3.0``, upgraded from ``8.0.33``, and supports passing custom versions: ``MySQL.get_packages(package_version=...)``. diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index ebf43f668..da71de55b 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -6,6 +6,7 @@ from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.mysql.dialect import MySQLDialect from onetl.hooks import slot, support_hooks @@ -104,28 +105,37 @@ class MySQL(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages(cls, package_version: str | None = None) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version for MySQL. |support_hooks| + + Parameters + ---------- + package_version : str, optional + Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.3.0``. Examples -------- - .. 
code:: python from onetl.connection import MySQL MySQL.get_packages() + # specify a custom package version + MySQL.get_packages(package_version="8.2.0") """ - return ["com.mysql:mysql-connector-j:8.0.33"] + default_version = "8.3.0" + version = Version(package_version or default_version).min_digits(3) + + return [f"com.mysql:mysql-connector-j:{version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MySQL.package` will be removed in 1.0.0, use `MySQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.mysql:mysql-connector-j:8.0.33" + return "com.mysql:mysql-connector-j:8.3.0" @property def jdbc_url(self): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index ed730c418..c071e1196 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -14,11 +14,35 @@ def test_mysql_class_attributes(): def test_mysql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MySQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MySQL.package == "com.mysql:mysql-connector-j:8.0.33" - - -def test_mysql_get_packages(): - assert MySQL.get_packages() == ["com.mysql:mysql-connector-j:8.0.33"] + assert MySQL.package == "com.mysql:mysql-connector-j:8.3.0" + + +@pytest.mark.parametrize( + "package_version, expected_packages", + [ + (None, ["com.mysql:mysql-connector-j:8.3.0"]), + ("8.3.0", ["com.mysql:mysql-connector-j:8.3.0"]), + ("8.1.0", ["com.mysql:mysql-connector-j:8.1.0"]), + ("8.0.33", ["com.mysql:mysql-connector-j:8.0.33"]), + ], +) +def test_mysql_get_packages(package_version, expected_packages): + assert MySQL.get_packages(package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "package_version", + [ + "8.3", + "abc", + ], +) +def test_mysql_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + MySQL.get_packages(package_version=package_version) def test_mysql_missing_package(spark_no_packages): From a7abef966e1a4ca5466018b79c21f09641f7210c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 18 Apr 2024 11:13:32 +0000 Subject: [PATCH 17/71] Fix rendering table in Readme See https://github.com/github/markup/issues/1798 --- README.rst | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.rst b/README.rst index 0095b69d6..4e13a5303 100644 --- a/README.rst +++ b/README.rst @@ -66,43 +66,43 @@ Supported storages | Type | Storage | Powered by | +====================+==============+=========================================================================================================================+ | Database | Clickhouse | Apache Spark `JDBC Data Source `_ | -+ +--------------+ + ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | MSSQL | | -+ +--------------+ + 
++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | MySQL | | -+ +--------------+ + ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Postgres | | -+ +--------------+ + ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Oracle | | -+ +--------------+ + ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Teradata | | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Hive | Apache Spark `Hive integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Kafka | Apache Spark `Kafka integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Greenplum | VMware `Greenplum Spark connector `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | MongoDB | `MongoDB Spark connector `_ | +--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | File | HDFS | `HDFS Python client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | S3 | `minio-py client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | SFTP | `Paramiko library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | FTP | `FTPUtil library `_ | -+ +--------------+ + 
++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | FTPS | | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | WebDAV | `WebdavClient3 library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | Samba | `pysmb library `_ | +--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | -| +--------------+ + ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | SparkHDFS | | -| +--------------+-------------------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ | | SparkS3 | `Hadoop AWS `_ library | +--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ From 305d82ff54f13f7230dd8bbf29cfa44fc9e2447f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 18 Apr 2024 11:16:58 +0000 Subject: [PATCH 18/71] Fix rendering table in Readme See https://github.com/github/markup/issues/1798 --- README.rst | 103 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/README.rst b/README.rst index 4e13a5303..62404bf59 100644 --- a/README.rst +++ b/README.rst @@ -62,50 +62,65 @@ Requirements Supported storages ------------------ -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Type | Storage | Powered by | -+====================+==============+=========================================================================================================================+ -| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | MSSQL | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | MySQL | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Postgres | | 
-+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Oracle | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Teradata | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Hive | Apache Spark `Hive integration `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Kafka | Apache Spark `Kafka integration `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Greenplum | VMware `Greenplum Spark connector `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | MongoDB | `MongoDB Spark connector `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| File | HDFS | `HDFS Python client `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | S3 | `minio-py client `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SFTP | `Paramiko library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | FTP | `FTPUtil library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | FTPS | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | WebDAV | `WebdavClient3 library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Samba | `pysmb library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SparkHDFS | | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SparkS3 | `Hadoop AWS `_ library | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ - +Database +~~~~~~~~ + ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | 
++==============+=========================================================================================================================+ +| Clickhouse | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MSSQL | | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MySQL | | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Postgres | | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Oracle | | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Teradata | | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Hive | Apache Spark `Hive integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Kafka | Apache Spark `Kafka integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Greenplum | VMware `Greenplum Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MongoDB | `MongoDB Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ + +File +~~~~ ++--------------+--------------------------------------------------------------------+ +| Storage | Powered by | ++==============+====================================================================+ +| HDFS | `HDFS Python client `_ | ++--------------+--------------------------------------------------------------------+ +| S3 | `minio-py client `_ | ++--------------+--------------------------------------------------------------------+ +| SFTP | `Paramiko library `_ | ++--------------+--------------------------------------------------------------------+ +| FTP | `FTPUtil library `_ | ++--------------+--------------------------------------------------------------------+ +| FTPS | | ++--------------+--------------------------------------------------------------------+ +| WebDAV | `WebdavClient3 library `_ | ++--------------+--------------------------------------------------------------------+ +| Samba | `pysmb library `_ | ++--------------+--------------------------------------------------------------------+ + +Files as DataFrame +~~~~~~~~~~~~~~~~~~ + ++--------------+---------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | ++==============+===============================================================================================================+ +| SparkLocalFS | Apache Spark `File Data Source `_ | ++--------------+---------------------------------------------------------------------------------------------------------------+ +| SparkHDFS | | ++--------------+---------------------------------------------------------------------------------------------------------------+ +| 
SparkS3 | `Hadoop AWS `_ library | ++--------------+---------------------------------------------------------------------------------------------------------------+ .. documentation From 8e0dc8973940b9567b849c069c5ac3e7bf97887a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 18 Apr 2024 11:21:13 +0000 Subject: [PATCH 19/71] Fix rendering table in Readme See https://github.com/github/markup/issues/1798 --- README.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 62404bf59..625112b2c 100644 --- a/README.rst +++ b/README.rst @@ -70,15 +70,15 @@ Database +==============+=========================================================================================================================+ | Clickhouse | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MSSQL | | +| MSSQL | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MySQL | | +| MySQL | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Postgres | | +| Postgres | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Oracle | | +| Oracle | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Teradata | | +| Teradata | Apache Spark `JDBC Data Source `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ | Hive | Apache Spark `Hive integration `_ | +--------------+-------------------------------------------------------------------------------------------------------------------------+ @@ -102,7 +102,7 @@ File +--------------+--------------------------------------------------------------------+ | FTP | `FTPUtil library `_ | +--------------+--------------------------------------------------------------------+ -| FTPS | | +| FTPS | `FTPUtil library `_ | +--------------+--------------------------------------------------------------------+ | WebDAV | `WebdavClient3 library `_ | +--------------+--------------------------------------------------------------------+ @@ -117,7 +117,7 @@ Files as DataFrame +==============+===============================================================================================================+ | SparkLocalFS | Apache Spark `File Data Source `_ | +--------------+---------------------------------------------------------------------------------------------------------------+ -| SparkHDFS | | +| SparkHDFS | Apache Spark `File Data Source `_ | +--------------+---------------------------------------------------------------------------------------------------------------+ | SparkS3 | `Hadoop AWS `_ library | +--------------+---------------------------------------------------------------------------------------------------------------+ From 
afa1b993d09851536e071a84bec5b5bd562c6100 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:19:14 +0300 Subject: [PATCH 20/71] [DOP-13852] - upgrade MSSQL packages (#254) * [DOP-13852] - upgrade MSSQL packages * [DOP-13852] - downgrade default package_version to 12.6.1 --- docs/changelog/next_release/254.feature.rst | 1 + .../db_connection/mssql/connection.py | 33 ++++++++---- .../test_mssql_unit.py | 51 ++++++++++++------- 3 files changed, 57 insertions(+), 28 deletions(-) create mode 100644 docs/changelog/next_release/254.feature.rst diff --git a/docs/changelog/next_release/254.feature.rst b/docs/changelog/next_release/254.feature.rst new file mode 100644 index 000000000..0a8aff089 --- /dev/null +++ b/docs/changelog/next_release/254.feature.rst @@ -0,0 +1 @@ +:class:`MSSQL` connection now uses Microsoft SQL Server JDBC driver ``12.6.1``, upgraded from ``12.2.0``, and supports passing custom versions: ``MSSQL.get_packages(java_version=..., package_version=...)``. diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 1756146fa..8fa91b47a 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -149,43 +149,54 @@ class MSSQL(JDBCConnection): @classmethod def get_packages( cls, - java_version: str | Version | None = None, + java_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for MSSQL. |support_hooks| Parameters ---------- - java_version : str, default ``8`` - Java major version. + java_version : str, optional + Java major version, defaults to ``8``. Must be ``8`` or ``11``. + package_version : str, optional + Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.6.1.``. Examples -------- - .. 
code:: python from onetl.connection import MSSQL MSSQL.get_packages() - MSSQL.get_packages(java_version="8") + # specify Java and package versions + MSSQL.get_packages(java_version="8", package_version="12.6.1.jre11") """ - if java_version is None: - java_version = "8" + default_java_version = "8" + default_package_version = "12.6.1" - java_ver = Version(java_version) + java_ver = Version(java_version or default_java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") jre_ver = "8" if java_ver.major < 11 else "11" - return [f"com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre{jre_ver}"] + full_package_version = Version(package_version or default_package_version).min_digits(3) + + # check if a JRE suffix is already included + if ".jre" in str(full_package_version): + jdbc_version = full_package_version + else: + jdbc_version = Version(f"{full_package_version}.jre{jre_ver}") + + return [f"com.microsoft.sqlserver:mssql-jdbc:{jdbc_version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MSSQL.package` will be removed in 1.0.0, use `MSSQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8" + return "com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index 7b0328ca9..51a548166 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ -14,31 +14,48 @@ def test_mssql_class_attributes(): def test_mssql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MSSQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8" + assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8" -def test_mssql_get_packages_no_input(): - assert MSSQL.get_packages() == ["com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"] - - -@pytest.mark.parametrize("java_version", ["7", "6"]) -def test_mssql_get_packages_java_version_not_supported(java_version): - with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): - MSSQL.get_packages(java_version=java_version) +@pytest.mark.parametrize( + "java_version, package_version, expected_packages", + [ + (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), + ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), + ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), + ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + ("8", "12.6.1.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), + ("11", "12.6.1.jre11", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + ("11", "12.7.0.jre11-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre11-preview"]), + ("8", "12.7.0.jre8-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre8-preview"]), + ("8", "12.6.1", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), + ("11", "12.6.1", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + ], +) +def test_mssql_get_packages(java_version, package_version, expected_packages): + assert MSSQL.get_packages(java_version=java_version, package_version=package_version) == expected_packages 
@pytest.mark.parametrize( - "java_version, package", + "package_version", [ - ("8", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"), - ("9", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"), - ("11", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), - ("17", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), - ("20", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), + "12.7", + "abc", ], ) -def test_mssql_get_packages(java_version, package): - assert MSSQL.get_packages(java_version=java_version) == [package] +def test_mssql_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + MSSQL.get_packages(package_version=package_version) + + +@pytest.mark.parametrize("java_version", ["7", "6"]) +def test_mssql_get_packages_java_version_not_supported(java_version): + with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): + MSSQL.get_packages(java_version=java_version) def test_mssql_missing_package(spark_no_packages): From b4d2ff3642fa51f8365871427762e257df64d265 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:26:19 +0300 Subject: [PATCH 21/71] [DOP-13853] - upgrade MongoDB packages (#255) * [DOP-13853] - upgrade MongoDB packages * [DOP-13853] - connector_version -> package_version --- docs/changelog/next_release/255.feature.rst | 1 + .../db_connection/mongodb/connection.py | 38 ++++++------ .../test_mongodb_unit.py | 58 ++++++++++++------- 3 files changed, 57 insertions(+), 40 deletions(-) create mode 100644 docs/changelog/next_release/255.feature.rst diff --git a/docs/changelog/next_release/255.feature.rst b/docs/changelog/next_release/255.feature.rst new file mode 100644 index 000000000..55d780196 --- /dev/null +++ b/docs/changelog/next_release/255.feature.rst @@ -0,0 +1 @@ +:class:`MongoDB` connection now uses MongoDB Spark connector ``10.2.2``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 51fc617fb..5d4299f39 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -131,15 +131,12 @@ class MongoDB(DBConnection): @classmethod def get_packages( cls, - scala_version: str | Version | None = None, - spark_version: str | Version | None = None, + scala_version: str | None = None, + spark_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| - - .. warning:: - - You should pass at least one parameter. + Get package names to be downloaded by Spark. Allows specifying custom MongoDB Spark connector versions. |support_hooks| Parameters ---------- @@ -149,23 +146,25 @@ def get_packages( If ``None``, ``spark_version`` is used to determine Scala version. spark_version : str, optional - Spark version in format ``major.minor``. + Spark version in format ``major.minor``. Used only if ``scala_version=None``. - Used only if ``scala_version=None``. + package_version : str, optional + Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.2.2``. Examples -------- - .. 
code:: python from onetl.connection import MongoDB MongoDB.get_packages(scala_version="2.12") - MongoDB.get_packages(spark_version="3.4") + # specify custom connector version + MongoDB.get_packages(scala_version="2.12", package_version="10.2.2") """ - # Connector version is fixed, so we can perform checks for Scala/Spark version + default_package_version = "10.2.2" + if scala_version: scala_ver = Version(scala_version).min_digits(2) elif spark_version: @@ -176,11 +175,12 @@ def get_packages( else: raise ValueError("You should pass either `scala_version` or `spark_version`") - if scala_ver < Version("2.12") or scala_ver > Version("2.13"): - raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver.format('{0}.{1}')}") + connector_ver = Version(package_version or default_package_version).min_digits(2) + + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version must be at least 2.12, got {scala_ver}") - # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector - return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.format('{0}.{1}')}:10.1.1"] + return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.format('{0}.{1}')}:{connector_ver}"] @classproperty def package_spark_3_2(cls) -> str: @@ -190,7 +190,7 @@ def package_spark_3_2(cls) -> str: "use `MongoDB.get_packages(spark_version='3.2')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" @classproperty def package_spark_3_3(cls) -> str: @@ -200,7 +200,7 @@ def package_spark_3_3(cls) -> str: "use `MongoDB.get_packages(spark_version='3.3')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" @classproperty def package_spark_3_4(cls) -> str: @@ -210,7 +210,7 @@ def package_spark_3_4(cls) -> str: "use `MongoDB.get_packages(spark_version='3.4')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" @slot def pipeline( diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index c7e9e42b7..5333617b1 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -12,9 +12,9 @@ def test_mongodb_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MongoDB.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" - assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" - assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" def test_mongodb_get_packages_no_input(): @@ -38,35 +38,51 @@ def test_mongodb_get_packages_spark_version_not_supported(spark_version): @pytest.mark.parametrize( "scala_version", [ + "2.9.2", "2.11", - "2.14", - "3.0", ], ) def 
test_mongodb_get_packages_scala_version_not_supported(scala_version): - with pytest.raises(ValueError, match=f"Scala version must be 2.12 - 2.13, got {scala_version}"): + with pytest.raises(ValueError, match=f"Scala version must be at least 2.12, got {scala_version}"): MongoDB.get_packages(scala_version=scala_version) @pytest.mark.parametrize( - "spark_version, scala_version, package", + "spark_version, scala_version, package_version, package", [ - # use Scala version directly - (None, "2.12", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - (None, "2.13", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), - # Detect Scala version by Spark version - ("3.2", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.3", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.4", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - # Override Scala version detected automatically - ("3.2", "2.12", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.4", "2.13", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), - # Scala version contain three digits when only two needed - ("3.2.4", "2.12.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), + (None, "2.12", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + (None, "2.13", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.13:10.2.2"), + ("3.2", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + ("3.3", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + ("3.4", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + ("3.2", "2.12", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), + ("3.4", "2.13", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), + ("3.2", "2.12", "10.2.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"), + ("3.2", "2.12", "10.2.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0"), + ("3.2.4", "2.12.1", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), ], ) -def test_mongodb_get_packages(spark_version, scala_version, package): - assert MongoDB.get_packages(spark_version=spark_version, scala_version=scala_version) == [package] +def test_mongodb_get_packages(spark_version, scala_version, package_version, package): + assert MongoDB.get_packages( + spark_version=spark_version, + scala_version=scala_version, + package_version=package_version, + ) == [package] + + +@pytest.mark.parametrize( + "package_version", + [ + "10", + "abc", + ], +) +def test_mongodb_get_packages_invalid_package_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 2\).", + ): + MongoDB.get_packages(scala_version="2.12", package_version=package_version) def test_mongodb_missing_package(spark_no_packages): From f9438525b54d86448ecdcfe01871570b2c0d45f2 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:37:54 +0300 Subject: [PATCH 22/71] [DOP-14492] - upgrade Teradata packages (#256) --- docs/changelog/next_release/256.feature.rst | 1 + .../db_connection/teradata/connection.py | 21 ++++++++++++--- .../test_teradata_unit.py | 27 +++++++++++++++++-- 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/next_release/256.feature.rst diff --git a/docs/changelog/next_release/256.feature.rst b/docs/changelog/next_release/256.feature.rst 
new file mode 100644 index 000000000..752d7ffb1 --- /dev/null +++ b/docs/changelog/next_release/256.feature.rst @@ -0,0 +1 @@ +:class:`Teradata` connection now supports passing custom versions: ``Teradata.get_packages(package_version=...)``. diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 0dc3e67dd..93bd51468 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -6,6 +6,7 @@ from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.teradata.dialect import TeradataDialect from onetl.hooks import slot @@ -124,21 +125,33 @@ class Teradata(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages( + cls, + package_version: str | None = None, + ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for Teradata. |support_hooks| + + Parameters + ---------- + package_version : str, optional + Specifies the version of the Teradata JDBC driver to use. Defaults to ``17.20.00.15``. Examples -------- - .. code:: python from onetl.connection import Teradata Teradata.get_packages() + # specify custom driver version + Teradata.get_packages(package_version="20.00.00.18") """ - return ["com.teradata.jdbc:terajdbc:17.20.00.15"] + default_package_version = "17.20.00.15" + version = Version(package_version or default_package_version).min_digits(4) + + return [f"com.teradata.jdbc:terajdbc:{version}"] @classproperty def package(cls) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py index dd9ba525d..fd90d31d4 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py @@ -17,8 +17,31 @@ def test_teradata_package(): assert Teradata.package == "com.teradata.jdbc:terajdbc:17.20.00.15" -def test_teradata_get_packages(): - assert Teradata.get_packages() == ["com.teradata.jdbc:terajdbc:17.20.00.15"] +@pytest.mark.parametrize( + "package_version, expected_package", + [ + (None, ["com.teradata.jdbc:terajdbc:17.20.00.15"]), + ("17.20.00.15", ["com.teradata.jdbc:terajdbc:17.20.00.15"]), + ("16.20.00.13", ["com.teradata.jdbc:terajdbc:16.20.00.13"]), + ], +) +def test_teradata_get_packages_valid_versions(package_version, expected_package): + assert Teradata.get_packages(package_version=package_version) == expected_package + + +@pytest.mark.parametrize( + "package_version", + [ + "20.00.13", + "abc", + ], +) +def test_teradata_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 4\).", + ): + Teradata.get_packages(package_version=package_version) def test_teradata_missing_package(spark_no_packages): From 23ae583531771a904ebcc0908531a6b42934a6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 19 Apr 2024 13:49:12 +0000 Subject: [PATCH 
23/71] Return back old table view --- README.rst | 103 +++++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/README.rst b/README.rst index 625112b2c..0095b69d6 100644 --- a/README.rst +++ b/README.rst @@ -62,65 +62,50 @@ Requirements Supported storages ------------------ -Database -~~~~~~~~ - -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Storage | Powered by | -+==============+=========================================================================================================================+ -| Clickhouse | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MSSQL | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MySQL | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Postgres | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Oracle | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Teradata | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Hive | Apache Spark `Hive integration `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Kafka | Apache Spark `Kafka integration `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Greenplum | VMware `Greenplum Spark connector `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MongoDB | `MongoDB Spark connector `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ - -File -~~~~ -+--------------+--------------------------------------------------------------------+ -| Storage | Powered by | -+==============+====================================================================+ -| HDFS | `HDFS Python client `_ | -+--------------+--------------------------------------------------------------------+ -| S3 | `minio-py client `_ | -+--------------+--------------------------------------------------------------------+ -| SFTP | `Paramiko library `_ | -+--------------+--------------------------------------------------------------------+ -| FTP | `FTPUtil library `_ | -+--------------+--------------------------------------------------------------------+ -| FTPS | `FTPUtil library `_ | -+--------------+--------------------------------------------------------------------+ -| WebDAV | `WebdavClient3 library `_ | -+--------------+--------------------------------------------------------------------+ -| Samba | `pysmb library `_ | 
-+--------------+--------------------------------------------------------------------+ - -Files as DataFrame -~~~~~~~~~~~~~~~~~~ - -+--------------+---------------------------------------------------------------------------------------------------------------+ -| Storage | Powered by | -+==============+===============================================================================================================+ -| SparkLocalFS | Apache Spark `File Data Source `_ | -+--------------+---------------------------------------------------------------------------------------------------------------+ -| SparkHDFS | Apache Spark `File Data Source `_ | -+--------------+---------------------------------------------------------------------------------------------------------------+ -| SparkS3 | `Hadoop AWS `_ library | -+--------------+---------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Type | Storage | Powered by | ++====================+==============+=========================================================================================================================+ +| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | ++ +--------------+ + +| | MSSQL | | ++ +--------------+ + +| | MySQL | | ++ +--------------+ + +| | Postgres | | ++ +--------------+ + +| | Oracle | | ++ +--------------+ + +| | Teradata | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Hive | Apache Spark `Hive integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Kafka | Apache Spark `Kafka integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Greenplum | VMware `Greenplum Spark connector `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | MongoDB | `MongoDB Spark connector `_ | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| File | HDFS | `HDFS Python client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | S3 | `minio-py client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SFTP | `Paramiko library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | FTP | `FTPUtil library `_ | ++ +--------------+ + +| | FTPS | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | WebDAV | `WebdavClient3 library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | 
++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | +| +--------------+ + +| | SparkHDFS | | +| +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SparkS3 | `Hadoop AWS `_ library | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ + .. documentation From 16cc41f93cb23e0e71f2891c950ca6db5fc29ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 19 Apr 2024 13:50:05 +0000 Subject: [PATCH 24/71] Fix rendering table in Readme See https://github.com/github/markup/issues/1798 --- README.rst | 103 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/README.rst b/README.rst index 0095b69d6..625112b2c 100644 --- a/README.rst +++ b/README.rst @@ -62,50 +62,65 @@ Requirements Supported storages ------------------ -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Type | Storage | Powered by | -+====================+==============+=========================================================================================================================+ -| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | -+ +--------------+ + -| | MSSQL | | -+ +--------------+ + -| | MySQL | | -+ +--------------+ + -| | Postgres | | -+ +--------------+ + -| | Oracle | | -+ +--------------+ + -| | Teradata | | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Hive | Apache Spark `Hive integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Kafka | Apache Spark `Kafka integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Greenplum | VMware `Greenplum Spark connector `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | MongoDB | `MongoDB Spark connector `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| File | HDFS | `HDFS Python client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | S3 | `minio-py client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SFTP | `Paramiko library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | FTP | `FTPUtil library `_ | -+ +--------------+ + -| | FTPS | | -+ 
+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | WebDAV | `WebdavClient3 library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Samba | `pysmb library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | -| +--------------+ + -| | SparkHDFS | | -| +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SparkS3 | `Hadoop AWS `_ library | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ - +Database +~~~~~~~~ + ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | ++==============+=========================================================================================================================+ +| Clickhouse | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MSSQL | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MySQL | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Postgres | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Oracle | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Teradata | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Hive | Apache Spark `Hive integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Kafka | Apache Spark `Kafka integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Greenplum | VMware `Greenplum Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MongoDB | `MongoDB Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ + +File +~~~~ ++--------------+--------------------------------------------------------------------+ +| Storage | Powered by | ++==============+====================================================================+ +| HDFS | `HDFS Python client `_ | ++--------------+--------------------------------------------------------------------+ +| S3 | `minio-py client `_ 
| ++--------------+--------------------------------------------------------------------+ +| SFTP | `Paramiko library `_ | ++--------------+--------------------------------------------------------------------+ +| FTP | `FTPUtil library `_ | ++--------------+--------------------------------------------------------------------+ +| FTPS | `FTPUtil library `_ | ++--------------+--------------------------------------------------------------------+ +| WebDAV | `WebdavClient3 library `_ | ++--------------+--------------------------------------------------------------------+ +| Samba | `pysmb library `_ | ++--------------+--------------------------------------------------------------------+ + +Files as DataFrame +~~~~~~~~~~~~~~~~~~ + ++--------------+---------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | ++==============+===============================================================================================================+ +| SparkLocalFS | Apache Spark `File Data Source `_ | ++--------------+---------------------------------------------------------------------------------------------------------------+ +| SparkHDFS | Apache Spark `File Data Source `_ | ++--------------+---------------------------------------------------------------------------------------------------------------+ +| SparkS3 | `Hadoop AWS `_ library | ++--------------+---------------------------------------------------------------------------------------------------------------+ .. documentation From 511a330d028c2b9a2cc8ded8d7ddab2e06f704db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 22 Apr 2024 15:37:33 +0000 Subject: [PATCH 25/71] [DOP-14025] Cleanup unused cache after merging PR --- .github/workflows/cache-cleanup.yml | 40 +++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/cache-cleanup.yml diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml new file mode 100644 index 000000000..0f6b3fc19 --- /dev/null +++ b/.github/workflows/cache-cleanup.yml @@ -0,0 +1,40 @@ +name: Cleanup caches after merge +on: + pull_request: + types: + - closed + workflow_dispatch: + +jobs: + cleanup: + runs-on: ubuntu-latest + permissions: + # `actions:write` permission is required to delete caches + # See also: https://docs.github.com/en/rest/actions/cache?apiVersion=2022-11-28#delete-a-github-actions-cache-for-a-repository-using-a-cache-id + actions: write + contents: read + + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Cleanup cache + run: | + gh extension install actions/gh-actions-cache + + REPO=${{ github.repository }} + BRANCH=refs/pull/${{ github.event.pull_request.number }}/merge + + echo "Fetching list of cache key" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH --limit 100 --sort size | cut -f 1 ) + + ## Setting this to not fail the workflow while deleting cache keys. + set +e + echo "Deleting caches..." 
+ for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b296810ba41760bdb04c3cfac2f95343ea34fde4 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 23 Apr 2024 13:43:28 +0300 Subject: [PATCH 26/71] [DOP-13840] - add JSON.parse_column and JSON.serialize_column methods (#257) * [DOP-13840] - add JSON.parse_column, JSON.serialize_column methods * [DOP-13840] - add tests * [DOP-13840] - update documentation --- docs/changelog/next_release/257.feature.rst | 1 + .../db_connection/clickhouse/types.rst | 28 +++--- .../db_connection/greenplum/types.rst | 39 ++++---- docs/connection/db_connection/mysql/types.rst | 29 +++--- .../connection/db_connection/oracle/types.rst | 27 +++--- .../db_connection/postgres/types.rst | 40 ++++----- docs/file_df/file_formats/json.rst | 2 +- onetl/file/format/json.py | 90 ++++++++++++++++++- .../test_json_integration.py | 74 +++++++++++++++ 9 files changed, 242 insertions(+), 88 deletions(-) create mode 100644 docs/changelog/next_release/257.feature.rst diff --git a/docs/changelog/next_release/257.feature.rst b/docs/changelog/next_release/257.feature.rst new file mode 100644 index 000000000..e72de4596 --- /dev/null +++ b/docs/changelog/next_release/257.feature.rst @@ -0,0 +1 @@ +Add ``JSON.parse_column`` and ``JSON.serialize_column`` methods to facilitate direct parsing of JSON strings into Spark DataFrame columns and serialization of structured DataFrame columns back into JSON strings. diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index b0171b503..9c579c08d 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -336,13 +336,17 @@ Explicit type cast ~~~~~~~~~~~~ Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format, -and then cast string column in resulting dataframe to proper type using `from_json `_: -.. code:: python +For parsing JSON columns in ClickHouse, :obj:`JSON.parse_column ` method. + +.. code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import ArrayType, IntegerType + from onetl.file.format import JSON + from onetl.connection import ClickHouse + from onetl.db import DBReader + reader = DBReader( connection=clickhouse, columns=[ @@ -357,16 +361,22 @@ and then cast string column in resulting dataframe to proper type using `from_js df = df.select( df.id, - from_json(df.array_column, column_type).alias("array_column"), + JSON().parse_column("array_column", column_type), ) ``DBWriter`` ~~~~~~~~~~~~ -Convert dataframe column to JSON using `to_json `_, -and write it as ``String`` column in Clickhouse: +For writing JSON data to ClickHouse, use the :obj:`JSON.serialize_column ` method to convert a DataFrame column to JSON format efficiently and write it as a ``String`` column in Clickhouse. + -.. code:: python +.. code-block:: python + + from onetl.file.format import JSON + from onetl.connection import ClickHouse + from onetl.db import DBWriter + + clickhouse = ClickHouse(...) 
clickhouse.execute( """ @@ -379,11 +389,9 @@ and write it as ``String`` column in Clickhouse: """, ) - from pyspark.sql.functions import to_json - df = df.select( df.id, - to_json(df.array_column).alias("array_column_json"), + JSON().serialize_column(df.array_column).alias("array_column_json"), ) writer.run(df) diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst index baea34914..63e7b9a2b 100644 --- a/docs/connection/db_connection/greenplum/types.rst +++ b/docs/connection/db_connection/greenplum/types.rst @@ -301,35 +301,29 @@ Explicit type cast ``DBReader`` ~~~~~~~~~~~~ -Unfortunately, it is not possible to cast unsupported column to some supported type on ``DBReader`` side: +Direct casting of Greenplum types is not supported by DBReader due to the connector’s implementation specifics. .. code-block:: python - DBReader( + reader = DBReader( connection=greenplum, # will fail - columns=["CAST(column AS text)"], + columns=["CAST(unsupported_column AS text)"], ) -This is related to Greenplum connector implementation. Instead of passing this ``CAST`` expression to ``SELECT`` query -as is, it performs type cast on Spark side, so this syntax is not supported. - -But there is a workaround - create a view with casting unsupported column to ``text`` (or any other supported type). +But there is a workaround - create a view with casting unsupported column to text (or any other supported type). +For example, you can use `to_json `_ Postgres function to convert column of any type to string representation and then parse this column on Spark side using :obj:`JSON.parse_column ` method. -For example, you can use ``to_json`` Postgres function for convert column of any type to string representation. -You can then parse this column on Spark side using `from_json `_: - -.. code:: python +.. code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import ArrayType, IntegerType from onetl.connection import Greenplum from onetl.db import DBReader + from onetl.file.format import JSON greenplum = Greenplum(...) - # create view with proper type cast greenplum.execute( """ CREATE VIEW schema.view_with_json_column AS @@ -350,29 +344,26 @@ You can then parse this column on Spark side using `from_json ` method. -For example, you can convert data using `to_json `_ function. - -.. code:: python +.. code-block:: python - from pyspark.sql.functions import to_json from onetl.connection import Greenplum - from onetl.db import DBReader + from onetl.db import DBWriter + from onetl.file.format import JSON greenplum = Greenplum(...) @@ -390,7 +381,7 @@ For example, you can convert data using `to_json `_ MySQL function -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +It is also possible to use `JSON_OBJECT `_ MySQL function and parse JSON columns in MySQL with the :obj:`JSON.parse_column ` method. .. code-block:: python - from pyspark.sql.types import IntegerType, StructField, StructType + from pyspark.sql.types import IntegerType, StructType, StructField from onetl.connection import MySQL from onetl.db import DBReader + from onetl.file.format import JSON mysql = MySQL(...) 
@@ -314,30 +313,30 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = StructType([StructField("key", IntegerType())]) + json_scheme = StructType([StructField("key", IntegerType())]) - # cast column content to proper Spark type df = df.select( df.id, df.supported_column, # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.json_column, schema).alias("struct_column"), + JSON().parse_column("json_column", json_scheme).alias("struct_column"), ) ``DBWriter`` ~~~~~~~~~~~~ -Convert dataframe column to JSON using `to_json `_, -and write it as ``text`` column in MySQL: +To write JSON data to a ``json`` or ``text`` column in a MySQL table, use the :obj:`JSON.serialize_column ` method. -.. code:: python +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBWriter + from onetl.file.format import JSON mysql.execute( """ - CREATE TABLE schema.target_tbl AS ( + CREATE TABLE schema.target_tbl ( id bigint, array_column_json json -- any string type, actually ) @@ -345,11 +344,9 @@ and write it as ``text`` column in MySQL: """, ) - from pyspark.sql.functions import to_json - df = df.select( df.id, - to_json(df.array_column).alias("array_column_json"), + JSON().serialize_column(df.array_column).alias("array_column_json"), ) writer.run(df) diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst index c06e43b13..330a460a8 100644 --- a/docs/connection/db_connection/oracle/types.rst +++ b/docs/connection/db_connection/oracle/types.rst @@ -301,12 +301,12 @@ For example, you can use ``CAST(column AS CLOB)`` to convert data to string repr It is also possible to use `JSON_ARRAY `_ or `JSON_OBJECT `_ Oracle functions -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +to convert column of any type to string representation. Then this JSON string can then be effectively parsed using the :obj:`JSON.parse_column ` method. .. code-block:: python - from pyspark.sql.types import IntegerType + from onetl.file.format import JSON + from pyspark.sql.types import IntegerType, StructType, StructField from onetl.connection import Oracle from onetl.db import DBReader @@ -325,32 +325,27 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = IntegerType() + json_scheme = StructType([StructField("key", IntegerType())]) - # cast column content to proper Spark type df = df.select( df.id, df.supported_column, - # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.array_column_json, schema).alias("array_column"), + JSON().parse_column("array_column_json", json_scheme).alias("array_column"), ) ``DBWriter`` ~~~~~~~~~~~~ -It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Oracle table. +It is always possible to convert data on Spark side to string, and then write it to text column in Oracle table. -For example, you can convert data using `to_json `_ function. +To serialize and write JSON data to a ``text`` or ``json`` column in an Oracle table use the :obj:`JSON.serialize_column ` method. -.. 
code:: python - - from pyspark.sql.functions import to_json +.. code-block:: python from onetl.connection import Oracle - from onetl.db import DBReader + from onetl.db import DBWriter + from onetl.file.format import JSON oracle = Oracle(...) @@ -367,7 +362,7 @@ For example, you can convert data using `to_json `_ Postgres function -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +It is also possible to use `to_json `_ Postgres function to convert column of any type to string representation, and then parse this column on Spark side you can use the :obj:`JSON.parse_column ` method: .. code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import IntegerType from onetl.connection import Postgres from onetl.db import DBReader + from onetl.file.format import JSON postgres = Postgres(...) @@ -374,35 +372,37 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = IntegerType() - - # cast column content to proper Spark type + json_schema = StructType( + [ + StructField("id", IntegerType(), nullable=True), + StructField("name", StringType(), nullable=True), + ..., + ] + ) df = df.select( df.id, df.supported_column, # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.array_column_json, schema).alias("array_column"), + JSON().parse_column("array_column_json", json_schema).alias("json_string"), ) ``DBWriter`` ~~~~~~~~~~~~ -It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Postgres table. - -Using ``to_json`` -^^^^^^^^^^^^^^^^^ +It is always possible to convert data on the Spark side to a string, and then write it to a text column in a Postgres table. -For example, you can convert data using `to_json `_ function. +Using JSON.serialize_column +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +You can use the :obj:`JSON.serialize_column ` method for data serialization: -.. code:: python +.. code-block:: python - from pyspark.sql.functions import to_json + from onetl.file.format import JSON + from pyspark.sql.functions import col from onetl.connection import Postgres - from onetl.db import DBReader + from onetl.db import DBWriter postgres = Postgres(...) @@ -419,7 +419,7 @@ For example, you can convert data using `to_json None: # always available pass + + def parse_column(self, column: str | Column, schema: StructType | ArrayType | MapType) -> Column: + """ + Parses a JSON string column to a structured Spark SQL column using Spark's `from_json `_ function, based on the provided schema. + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing JSON strings to parse. + + schema : StructType | ArrayType | MapType + The schema to apply when parsing the JSON data. This defines the structure of the output DataFrame column. + + Returns + ------- + Column + A new Column object with data parsed from JSON string to the specified structure. + + Examples + -------- + .. 
code:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import StructType, StructField, IntegerType, StringType + + spark = SparkSession.builder.appName("JSONParsingExample").getOrCreate() + json = JSON() + df = spark.createDataFrame([(1, '{"id":123, "name":"John"}')], ["id", "json_string"]) + schema = StructType( + [StructField("id", IntegerType()), StructField("name", StringType())] + ) + + parsed_df = df.withColumn("parsed_json", json.parse_column("json_string", schema)) + parsed_df.show() + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, from_json + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + + if isinstance(column, Column): + column_name, column = column._jc.toString(), column.cast("string") # noqa: WPS437 + else: + column_name, column = column, col(column).cast("string") + + return from_json(column, schema, self.dict()).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into a JSON string column using Spark's `to_json `_ function. + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing the data to serialize to JSON. + + Returns + ------- + Column + A new Column object with data serialized from Spark SQL structures to JSON string. + + Examples + -------- + .. code:: python + + from pyspark.sql import SparkSession + from pyspark.sql.functions import struct + + spark = SparkSession.builder.appName("JSONSerializationExample").getOrCreate() + json = JSON() + df = spark.createDataFrame([(123, "John")], ["id", "name"]) + df = df.withColumn("combined", struct("id", "name")) + + serialized_df = df.withColumn("json_string", json.serialize_column("combined")) + serialized_df.show() + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, to_json + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + return to_json(column, self.dict()).alias(column_name) diff --git a/tests/tests_integration/test_file_format_integration/test_json_integration.py b/tests/tests_integration/test_file_format_integration/test_json_integration.py index f1fbd1380..9bb477c90 100644 --- a/tests/tests_integration/test_file_format_integration/test_json_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_json_integration.py @@ -9,6 +9,20 @@ from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import JSON +try: + from pyspark.sql import Row + from pyspark.sql.functions import col + from pyspark.sql.types import ( + ArrayType, + IntegerType, + MapType, + StringType, + StructField, + StructType, + ) +except ImportError: + pytest.skip("Missing pyspark", allow_module_level=True) + try: from tests.util.assert_df import assert_equal_df except ImportError: @@ -62,3 +76,63 @@ def test_json_writer_is_not_supported( format=JSON(), target_path=json_root, ) + + +@pytest.mark.parametrize( + "json_string, schema, expected", + [ + ( + '{"id": 1, "name": "Alice"}', + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + Row(id=1, name="Alice"), + ), + ( + '[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]', + ArrayType(StructType([StructField("id", IntegerType()), StructField("name", 
StringType())])), + [Row(id=1, name="Alice"), Row(id=2, name="Bob")], + ), + ( + '{"key1": "value1", "key2": "value2"}', + MapType(StringType(), StringType()), + {"key1": "value1", "key2": "value2"}, + ), + ], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_json_parse_column(spark, json_string, schema, expected, column_type): + json = JSON() + df = spark.createDataFrame([(json_string,)], ["json_column"]) + parsed_df = df.select(json.parse_column(column_type("json_column"), schema)) + assert parsed_df.columns == ["json_column"] + assert parsed_df.select("json_column").first()["json_column"] == expected + + +@pytest.mark.parametrize( + "data, schema, expected_json", + [ + ( + {"id": 1, "name": "Alice"}, + StructType([StructField("id", IntegerType(), True), StructField("name", StringType(), True)]), + '{"id":1,"name":"Alice"}', + ), + ( + [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], + ArrayType(StructType([StructField("id", IntegerType(), True), StructField("name", StringType(), True)])), + '[{"id":1,"name":"Alice"},{"id":2,"name":"Bob"}]', + ), + ( + {"key1": "value1", "key2": "value2"}, + MapType(StringType(), StringType()), + '{"key1":"value1","key2":"value2"}', + ), + ], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_json_serialize_column(spark, data, schema, expected_json, column_type): + json = JSON() + df_schema = StructType([StructField("json_string", schema)]) + df = spark.createDataFrame([(data,)], df_schema) + serialized_df = df.select(json.serialize_column(column_type("json_string"))) + actual_json = serialized_df.select("json_string").first()["json_string"] + assert actual_json == expected_json + assert serialized_df.columns == ["json_string"] From f96ef47dc94fd681e902cacdd8596219c5b19658 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 24 Apr 2024 18:02:43 +0300 Subject: [PATCH 27/71] [DOP-13844] - implement CSV.parse_column, CSV.serialize_column (#258) * [DOP-13844] - implement CSV.parse_column, CSV.serialize_column * [DOP-13844] - add tests * [DOP-13844] - add kafka docs example * [DOP-13844] - add kafka json docs example * [DOP-13844] - remove header tests * [DOP-13844] - add csv serialization tests --- docs/changelog/next_release/258.feature.rst | 1 + .../db_connection/kafka/format_handling.rst | 154 ++++++++++++++++++ docs/connection/db_connection/kafka/index.rst | 6 + docs/file_df/file_formats/csv.rst | 2 +- onetl/file/format/csv.py | 111 ++++++++++++- onetl/file/format/json.py | 4 +- .../test_csv_integration.py | 90 +++++++++- 7 files changed, 363 insertions(+), 5 deletions(-) create mode 100644 docs/changelog/next_release/258.feature.rst create mode 100644 docs/connection/db_connection/kafka/format_handling.rst diff --git a/docs/changelog/next_release/258.feature.rst b/docs/changelog/next_release/258.feature.rst new file mode 100644 index 000000000..e962e3156 --- /dev/null +++ b/docs/changelog/next_release/258.feature.rst @@ -0,0 +1 @@ +Add ``CSV.parse_column`` and ``CSV.serialize_column`` methods to facilitate direct parsing of CSV strings into Spark DataFrame CSV columns and serialization of structured DataFrame CSV columns back into CSV strings. diff --git a/docs/connection/db_connection/kafka/format_handling.rst b/docs/connection/db_connection/kafka/format_handling.rst new file mode 100644 index 000000000..74307c3f6 --- /dev/null +++ b/docs/connection/db_connection/kafka/format_handling.rst @@ -0,0 +1,154 @@ +.. 
_kafka-data-format-handling: + +Data Format Handling +-------------------- + +Kafka topics can store data in various formats including ``JSON``, ``CSV``, ``Avro``, etc. Below are examples of how to handle data formats using custom methods for parsing and serialization integrated with Spark's DataFrame operations. + +CSV Format Handling +------------------- + +``DBReader`` +~~~~~~~~~~~~ + +To handle CSV formatted data stored in Kafka topics, use the :obj:`CSV.parse_column ` method. This method allows you to convert a CSV string column directly into a structured Spark DataFrame using a specified schema. + +.. code-block:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import StructType, StructField, IntegerType, StringType + + from onetl.db import DBReader + from onetl.file.format import CSV + from onetl.connection import Kafka + + spark = SparkSession.builder.appName("KafkaCSVExample").getOrCreate() + + kafka = Kafka(addresses=["kafka-broker1:9092"], cluster="example-cluster", spark=spark) + csv = CSV(sep=",", encoding="utf-8") + + reader = DBReader( + connection=kafka, + topic="topic_name", + ) + df = reader.run() + + df.show() + # +----+--------+--------+---------+------+-----------------------+-------------+ + # |key |value |topic |partition|offset|timestamp |timestampType| + # +----+--------+--------+---------+------+-----------------------+-------------+ + # |[31]|Alice,20|topicCSV|0 |0 |2024-04-24 13:02:25.911|0 | + # |[32]|Bob,25 |topicCSV|0 |1 |2024-04-24 13:02:25.922|0 | + # +----+--------+--------+---------+------+-----------------------+-------------+ + + # schema for parsing CSV data from Kafka + csv_schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ] + ) + + parsed_df = df.select(csv.parse_column("value", csv_schema)) + parse_df.select("value").first() + # Row(value=Row(name='Alice', age=20)) + +``DBWriter`` +~~~~~~~~~~~~ + +To serialize structured data into CSV format and write it back to a Kafka topic, use the :obj:`CSV.serialize_column ` method. + +.. code-block:: python + + from onetl.db import DBWriter + from onetl.file.format import CSV + from onetl.connection import Kafka + + kafka = Kafka(...) + csv = CSV(sep=",", encoding="utf-8") + + df.select("value").show() + # +------------+ + # |value | + # +------------+ + # |{Alice, 20} | + # |{Bob, 25} | + # +------------+ + + + # serializing data parsed in reading example into CSV format + serialized_df = df.select(csv.serialize_column("value")) + + writer = DBWriter(connection=kafka, topic="topic_name") + writer.run(serialized_df) + + + serialized_df.show() + # +---+-----------+ + # |key|value | + # +---+-----------+ + # | 1|"Alice,20" | + # | 2|"Bob,25" | + # +---+-----------+ + +JSON Format Handling +-------------------- + +``DBReader`` +~~~~~~~~~~~~ + +To process JSON formatted data from Kafka, use the :obj:`JSON.parse_column ` method. + +.. 
code-block:: python + + from onetl.file.format import JSON + + df.show() + # +----+-------------------------+----------+---------+------+-----------------------+-------------+ + # |key |value |topic |partition|offset|timestamp |timestampType| + # +----+-------------------------+----------+---------+------+-----------------------+-------------+ + # |[31]|{"name":"Alice","age":20}|topicKafka|0 |0 |2024-04-24 16:51:11.739|0 | + # |[32]|{"name":"Bob","age":25} |topicKafka|0 |1 |2024-04-24 16:51:11.749|0 | + # +----+-------------------------+----------+---------+------+-----------------------+-------------+ + + json = JSON() + + json_schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ] + ) + + parsed_json_df = df.select(json.parse_column("value", json_schema)) + + parsed_json_df.first() + # Row(value=Row(name='Alice', age=20)) + +``DBWriter`` +~~~~~~~~~~~~ + +For serializing data into JSON format and sending it back to Kafka, use the :obj:`JSON.serialize_column `. + +.. code-block:: python + + from onetl.file.format import JSON + + df.show() + # +-----------+ + # |value | + # +-----------+ + # |{Alice, 20}| + # |{Bob, 25} | + # +-----------+ + + json = JSON() + + serialized_json_df = df.select(json.serialize_column("data_column")) + serialized_json_df.show() + # +-------------------------+ + # |value | + # +-------------------------+ + # |{"name":"Alice","age":20}| + # |{"name":"Bob","age":25} | + # +-------------------------+ diff --git a/docs/connection/db_connection/kafka/index.rst b/docs/connection/db_connection/kafka/index.rst index a02aaecdc..6076c8087 100644 --- a/docs/connection/db_connection/kafka/index.rst +++ b/docs/connection/db_connection/kafka/index.rst @@ -31,6 +31,12 @@ Kafka read write +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + format_handling + .. toctree:: :maxdepth: 1 :caption: For developers diff --git a/docs/file_df/file_formats/csv.rst b/docs/file_df/file_formats/csv.rst index 44201e71a..93119ed25 100644 --- a/docs/file_df/file_formats/csv.rst +++ b/docs/file_df/file_formats/csv.rst @@ -6,4 +6,4 @@ CSV .. currentmodule:: onetl.file.format.csv .. autoclass:: CSV - :members: __init__ + :members: __init__, parse_column, serialize_column diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 41794f7ef..153c8923d 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import warnings from typing import TYPE_CHECKING, ClassVar try: @@ -13,7 +14,8 @@ from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import Column, SparkSession + from pyspark.sql.types import StructType READ_WRITE_OPTIONS = frozenset( @@ -109,3 +111,110 @@ class Config: def check_if_supported(cls, spark: SparkSession) -> None: # always available pass + + def parse_column(self, column: str | Column, schema: StructType) -> Column: + """ + Parses a CSV string column to a structured Spark SQL column using Spark's `from_csv `_ function, based on the provided schema. + + .. note:: + The ``from_csv`` function is available from Apache Spark ``3.0.0`` onwards. + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing CSV strings to parse. + + schema : StructType + The schema to apply when parsing the CSV data. This defines the structure of the output DataFrame CSV column. 
+ + Returns + ------- + Column + A new Column object with data parsed from CSV string to the specified CSV structured format. + + Examples + -------- + .. code:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import StructType, StructField, IntegerType, StringType + + spark = SparkSession.builder.appName("CSVParsingExample").getOrCreate() + csv = CSV() + df = spark.createDataFrame([("1,some",), ("2,another",)], ["csv_string"]) + schema = StructType( + [StructField("id", IntegerType()), StructField("text", StringType())] + ) + + parsed_df = df.select(csv.parse_column("csv_string", schema)) + parsed_df.show() + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, from_csv + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + self._check_unsupported_serialization_options() + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column).cast("string") + + schema_string = schema.simpleString() + return from_csv(column, schema_string, self.dict()).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into a CSV string column using Spark's `to_csv `_ function. + + .. note:: + The ``to_csv`` function is available from Apache Spark ``3.0.0`` onwards. + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing the data to serialize to CSV. + + Returns + ------- + Column + A new Column object with data serialized from Spark SQL structures to CSV string. + + Examples + -------- + .. code:: python + + from pyspark.sql import SparkSession + from pyspark.sql.functions import struct + + spark = SparkSession.builder.appName("CSVSerializationExample").getOrCreate() + csv = CSV() + df = spark.createDataFrame([(123, "John")], ["id", "name"]) + df = df.withColumn("combined", struct("id", "name")) + + serialized_df = df.select(csv.serialize_column("combined")) + serialized_df.show() + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, to_csv + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + self._check_unsupported_serialization_options() + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + return to_csv(column, self.dict()).alias(column_name) + + def _check_unsupported_serialization_options(self): + unsupported_options = ["header", "compression", "inferSchema"] + for option in unsupported_options: + if self.dict().get(option): + warnings.warn( + f"Option `{option}` is set but not supported in `CSV.parse_column` or `CSV.serialize_column`. 
" + "This may lead to unexpected behavior.", + UserWarning, + stacklevel=2, + ) diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 56cdc7f1f..029a5d288 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -136,7 +136,7 @@ def parse_column(self, column: str | Column, schema: StructType | ArrayType | Ma [StructField("id", IntegerType()), StructField("name", StringType())] ) - parsed_df = df.withColumn("parsed_json", json.parse_column("json_string", schema)) + parsed_df = df.select(json.parse_column("json_string", schema)) parsed_df.show() """ from pyspark.sql import Column, SparkSession # noqa: WPS442 @@ -177,7 +177,7 @@ def serialize_column(self, column: str | Column) -> Column: df = spark.createDataFrame([(123, "John")], ["id", "name"]) df = df.withColumn("combined", struct("id", "name")) - serialized_df = df.withColumn("json_string", json.serialize_column("combined")) + serialized_df = df.select(json.serialize_column("combined")) serialized_df.show() """ from pyspark.sql import Column, SparkSession # noqa: WPS442 diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index 56cb8f052..760f4b114 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -12,7 +12,9 @@ from onetl.file.format import CSV try: - from pyspark.sql.functions import col + from pyspark.sql import Row + from pyspark.sql.functions import col, struct + from pyspark.sql.types import IntegerType, StringType, StructField, StructType from tests.util.assert_df import assert_equal_df from tests.util.spark_df import reset_column_names @@ -133,3 +135,89 @@ def test_csv_writer_with_options( assert read_df.count() assert read_df.schema == df.schema assert_equal_df(read_df, df, order_by="id") + + +@pytest.mark.parametrize( + "csv_string, schema, options, expected", + [ + ( + "1,Anne", + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ",", "header": False}, + Row(id=1, name="Anne"), + ), + ( + "1;Anne", + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ";", "header": False}, + Row(id=1, name="Anne"), + ), + ( + '"1","Anne"', + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ",", "quote": '"', "header": False}, + Row(id=1, name="Anne"), + ), + ], + ids=["comma-delimited", "semicolon-delimited", "quoted-comma-delimited"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_csv_parse_column(spark, csv_string, schema, options, expected, column_type): + csv_handler = CSV(**options) + df = spark.createDataFrame([(csv_string,)], ["csv_string"]) + parsed_df = df.select(csv_handler.parse_column(column_type("csv_string"), schema)) + assert parsed_df.columns == ["csv_string"] + assert parsed_df.first()["csv_string"] == expected + + +@pytest.mark.parametrize( + "data, schema, options, expected_csv", + [ + ( + Row(id=1, name="Alice"), + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ","}, + "1,Alice", + ), + ( + Row(id=1, name="Alice"), + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ";"}, + "1;Alice", + ), + ], + ids=["comma-delimited", "semicolon-delimited"], +) +@pytest.mark.parametrize("column_type", 
[str, col]) +def test_csv_serialize_column(spark, data, schema, options, expected_csv, column_type): + csv_handler = CSV(**options) + df = spark.createDataFrame([data], schema) + df = df.withColumn("csv_column", struct("id", "name")) + serialized_df = df.select(csv_handler.serialize_column(column_type("csv_column"))) + assert serialized_df.columns == ["csv_column"] + assert serialized_df.first()["csv_column"] == expected_csv + + +@pytest.mark.parametrize( + "options", + [ + ({"header": True}), + ({"compression": "gzip"}), + ({"inferSchema": True}), + ], + ids=["with-header", "with-compression", "with-inferSchema"], +) +def test_csv_unsupported_options_warning(spark, options): + schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())]) + df = spark.createDataFrame([Row(id=1, name="Alice")], schema) + df = df.withColumn("csv_column", struct("id", "name")) + + csv_handler = CSV(**options) + msg = ( + f"Option `{list(options.keys())[0]}` is set but not supported in `CSV.parse_column` or `CSV.serialize_column`." + ) + + with pytest.warns(UserWarning) as record: + df.select(csv_handler.serialize_column("csv_column")).collect() + assert record + assert msg in str(record[0].message) From c5a9dea818a677fe123475557989d4b4adfb55d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 15:09:34 +0000 Subject: [PATCH 28/71] Fix building documentation --- requirements/docs.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index 3776dbb09..f86c2207b 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -3,7 +3,8 @@ furo importlib-resources<6 numpydoc pygments-csv-lexer -sphinx +# TODO: remove version limit after https://github.com/pradyunsg/furo/pull/783 +sphinx<7.3 sphinx-copybutton sphinx-design sphinx-favicon From 22346537e93d7063c28d7d07da8b173da8791528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 16:10:26 +0000 Subject: [PATCH 29/71] [DOP-13840] Fix JSON.parse_column and CSV.parse_column on Spark 2.x --- .github/workflows/data/local-fs/tracked.txt | 16 ++++++ onetl/file/format/csv.py | 49 ++++++++++++++----- onetl/file/format/json.py | 7 ++- .../test_csv_integration.py | 45 ++++++++++++++--- .../test_json_integration.py | 8 +++ .../test_kafka_reader_integration.py | 2 +- .../test_format_unit/test_avro_unit.py | 2 + .../test_format_unit/test_csv_unit.py | 2 + .../test_format_unit/test_excel_unit.py | 2 + .../test_format_unit/test_json_unit.py | 2 + .../test_format_unit/test_jsonline_unit.py | 2 + .../test_format_unit/test_orc_unit.py | 2 + .../test_format_unit/test_parquet_unit.py | 2 + .../test_format_unit/test_xml_unit.py | 9 +--- 14 files changed, 122 insertions(+), 28 deletions(-) diff --git a/.github/workflows/data/local-fs/tracked.txt b/.github/workflows/data/local-fs/tracked.txt index bb8d4c276..a61170172 100644 --- a/.github/workflows/data/local-fs/tracked.txt +++ b/.github/workflows/data/local-fs/tracked.txt @@ -2,3 +2,19 @@ **/*local-fs* **/*local_fs*/** **/*local-fs*/** +**/*csv* +**/*csv*/** +**/*json* +**/*json*/** +**/*xml* +**/*xml*/** +**/*excel* +**/*excel*/** +**/*avro* +**/*avro*/** +**/*orc* 
+**/*orc*/** +**/*parquet* +**/*parquet*/** +**/*file_format* +**/*file_format*/** diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 153c8923d..8b8f1922b 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -10,6 +10,8 @@ except (ImportError, AttributeError): from pydantic import Field # type: ignore[no-redef, assignment] +from onetl._internal import stringify +from onetl._util.spark import get_spark_version from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks @@ -114,10 +116,12 @@ def check_if_supported(cls, spark: SparkSession) -> None: def parse_column(self, column: str | Column, schema: StructType) -> Column: """ - Parses a CSV string column to a structured Spark SQL column using Spark's `from_csv `_ function, based on the provided schema. + Parses a CSV string column to a structured Spark SQL column using Spark's + `from_csv `_ function, based on the provided schema. .. note:: - The ``from_csv`` function is available from Apache Spark ``3.0.0`` onwards. + + Can be used only with Spark 3.x+ Parameters ---------- @@ -149,26 +153,33 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: parsed_df = df.select(csv.parse_column("csv_string", schema)) parsed_df.show() """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 - from pyspark.sql.functions import col, from_csv - self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) self._check_unsupported_serialization_options() + from pyspark.sql.functions import col, from_csv + if isinstance(column, Column): column_name = column._jc.toString() # noqa: WPS437 else: column_name, column = column, col(column).cast("string") schema_string = schema.simpleString() - return from_csv(column, schema_string, self.dict()).alias(column_name) + options = stringify(self.dict(by_alias=True)) + return from_csv(column, schema_string, options).alias(column_name) def serialize_column(self, column: str | Column) -> Column: """ - Serializes a structured Spark SQL column into a CSV string column using Spark's `to_csv `_ function. + Serializes a structured Spark SQL column into a CSV string column using Spark's + `to_csv `_ function. .. note:: - The ``to_csv`` function is available from Apache Spark ``3.0.0`` onwards. 
+ + Can be used only with Spark 3.x+ Parameters ---------- @@ -195,23 +206,39 @@ def serialize_column(self, column: str | Column) -> Column: serialized_df = df.select(csv.serialize_column("combined")) serialized_df.show() """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 - from pyspark.sql.functions import col, to_csv - self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) self._check_unsupported_serialization_options() + from pyspark.sql.functions import col, to_csv + if isinstance(column, Column): column_name = column._jc.toString() # noqa: WPS437 else: column_name, column = column, col(column) - return to_csv(column, self.dict()).alias(column_name) + options = stringify(self.dict(by_alias=True)) + return to_csv(column, options).alias(column_name) + + def _check_spark_version_for_serialization(self, spark: SparkSession): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + class_name = self.__class__.__name__ + error_msg = ( + f"`{class_name}.parse_column` or `{class_name}.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + raise ValueError(error_msg) def _check_unsupported_serialization_options(self): unsupported_options = ["header", "compression", "inferSchema"] + current_options = self.dict() for option in unsupported_options: - if self.dict().get(option): + if current_options.get(option): warnings.warn( f"Option `{option}` is set but not supported in `CSV.parse_column` or `CSV.serialize_column`. " "This may lead to unexpected behavior.", diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 029a5d288..455cf29a0 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -6,6 +6,7 @@ from typing_extensions import Literal +from onetl._internal import stringify from onetl.file.format.file_format import ReadOnlyFileFormat from onetl.hooks import slot, support_hooks @@ -149,7 +150,8 @@ def parse_column(self, column: str | Column, schema: StructType | ArrayType | Ma else: column_name, column = column, col(column).cast("string") - return from_json(column, schema, self.dict()).alias(column_name) + options = stringify(self.dict(by_alias=True)) + return from_json(column, schema, options).alias(column_name) def serialize_column(self, column: str | Column) -> Column: """ @@ -190,4 +192,5 @@ def serialize_column(self, column: str | Column) -> Column: else: column_name, column = column, col(column) - return to_json(column, self.dict()).alias(column_name) + options = stringify(self.dict(by_alias=True)) + return to_json(column, options).alias(column_name) diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index 760f4b114..7d0983a9b 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -4,6 +4,9 @@ Do not test all the possible options and combinations, we are not testing Spark here. 
""" +import contextlib +import re + import pytest from onetl._util.spark import get_spark_version @@ -163,11 +166,23 @@ def test_csv_writer_with_options( ) @pytest.mark.parametrize("column_type", [str, col]) def test_csv_parse_column(spark, csv_string, schema, options, expected, column_type): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + msg = ( + f"`CSV.parse_column` or `CSV.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=re.escape(msg)) + else: + context_manager = contextlib.nullcontext() + csv_handler = CSV(**options) df = spark.createDataFrame([(csv_string,)], ["csv_string"]) - parsed_df = df.select(csv_handler.parse_column(column_type("csv_string"), schema)) - assert parsed_df.columns == ["csv_string"] - assert parsed_df.first()["csv_string"] == expected + + with context_manager: + parsed_df = df.select(csv_handler.parse_column(column_type("csv_string"), schema)) + assert parsed_df.columns == ["csv_string"] + assert parsed_df.first()["csv_string"] == expected @pytest.mark.parametrize( @@ -190,12 +205,24 @@ def test_csv_parse_column(spark, csv_string, schema, options, expected, column_t ) @pytest.mark.parametrize("column_type", [str, col]) def test_csv_serialize_column(spark, data, schema, options, expected_csv, column_type): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + msg = ( + f"`CSV.parse_column` or `CSV.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=re.escape(msg)) + else: + context_manager = contextlib.nullcontext() + csv_handler = CSV(**options) df = spark.createDataFrame([data], schema) df = df.withColumn("csv_column", struct("id", "name")) - serialized_df = df.select(csv_handler.serialize_column(column_type("csv_column"))) - assert serialized_df.columns == ["csv_column"] - assert serialized_df.first()["csv_column"] == expected_csv + + with context_manager: + serialized_df = df.select(csv_handler.serialize_column(column_type("csv_column"))) + assert serialized_df.columns == ["csv_column"] + assert serialized_df.first()["csv_column"] == expected_csv @pytest.mark.parametrize( @@ -207,7 +234,11 @@ def test_csv_serialize_column(spark, data, schema, options, expected_csv, column ], ids=["with-header", "with-compression", "with-inferSchema"], ) -def test_csv_unsupported_options_warning(spark, options): +def test_csv_serialize_column_unsupported_options_warning(spark, options): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + pytest.skip("CSV.serialize_column in supported on Spark 3.x only") + schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())]) df = spark.createDataFrame([Row(id=1, name="Alice")], schema) df = df.withColumn("csv_column", struct("id", "name")) diff --git a/tests/tests_integration/test_file_format_integration/test_json_integration.py b/tests/tests_integration/test_file_format_integration/test_json_integration.py index 9bb477c90..71bc4e5cc 100644 --- a/tests/tests_integration/test_file_format_integration/test_json_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_json_integration.py @@ -6,6 +6,8 @@ import pytest +from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import JSON @@ -97,9 +99,14 @@ def 
test_json_writer_is_not_supported( {"key1": "value1", "key2": "value2"}, ), ], + ids=["struct", "map", "array"], ) @pytest.mark.parametrize("column_type", [str, col]) def test_json_parse_column(spark, json_string, schema, expected, column_type): + spark_version = get_spark_version(spark) + if spark_version < Version("2.4") and isinstance(schema, MapType): + pytest.skip("JSON.parse_column accepts MapType only in Spark 2.4+") + json = JSON() df = spark.createDataFrame([(json_string,)], ["json_column"]) parsed_df = df.select(json.parse_column(column_type("json_column"), schema)) @@ -126,6 +133,7 @@ def test_json_parse_column(spark, json_string, schema, expected, column_type): '{"key1":"value1","key2":"value2"}', ), ], + ids=["struct", "array", "map"], ) @pytest.mark.parametrize("column_type", [str, col]) def test_json_serialize_column(spark, data, schema, expected_json, column_type): diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py index 86f2b7e98..9c6e3db57 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py @@ -163,7 +163,7 @@ def test_kafka_reader_topic_does_not_exist(spark, processing): @pytest.mark.parametrize("group_id_option", ["group.id", "groupIdPrefix"]) def test_kafka_reader_with_group_id(group_id_option, spark, processing, kafka_dataframe_schema, kafka_topic): if get_spark_version(spark).major < 3: - pytest.skip("Spark 3.x or later is required to pas group.id") + pytest.skip("Spark 3.x or later is required to pass group.id") first_span = processing.create_pandas_df(min_id=0, max_id=100) processing.insert_pandas_df_into_topic(first_span, kafka_topic) diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py index 081ed478f..3c2ef1605 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Avro +pytestmark = [pytest.mark.avro] + @pytest.mark.parametrize( "spark_version", diff --git a/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py b/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py index 125920f18..34b366c71 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import CSV +pytestmark = [pytest.mark.csv] + def test_csv_options_default(): csv = CSV() diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py index c99b375fd..95dae3da1 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Excel +pytestmark = [pytest.mark.excel] + @pytest.mark.parametrize( "spark_version", diff --git a/tests/tests_unit/test_file/test_format_unit/test_json_unit.py b/tests/tests_unit/test_file/test_format_unit/test_json_unit.py index 1b3b4527a..702063ea8 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_json_unit.py +++ 
b/tests/tests_unit/test_file/test_format_unit/test_json_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import JSON +pytestmark = [pytest.mark.json] + def test_json_options_default(): json = JSON() diff --git a/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py b/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py index 9b2e361cd..65199adbb 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import JSONLine +pytestmark = [pytest.mark.json] + def test_jsonline_options_default(): jsonline = JSONLine() diff --git a/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py b/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py index ea3d5a2a8..60910f045 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import ORC +pytestmark = [pytest.mark.orc] + @pytest.mark.parametrize( "known_option", diff --git a/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py b/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py index ef83e07b0..e332f50d9 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Parquet +pytestmark = [pytest.mark.parquet] + @pytest.mark.parametrize( "known_option", diff --git a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py index bd98c0ff5..d2b6c69d3 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import XML +pytestmark = [pytest.mark.xml] + @pytest.mark.parametrize( "spark_version, scala_version, package_version, expected_packages", @@ -121,10 +123,3 @@ def test_xml_options_unknown(caplog): xml = XML(row_tag="item", unknownOption="abc") assert xml.unknownOption == "abc" assert "Options ['unknownOption'] are not known by XML, are you sure they are valid?" in caplog.text - - -@pytest.mark.local_fs -def test_xml_missing_package(spark_no_packages): - msg = "Cannot import Java class 'com.databricks.spark.xml.XmlReader'" - with pytest.raises(ValueError, match=msg): - XML(row_tag="item").check_if_supported(spark_no_packages) From ca77c920c301358a8119ccf50bb57e1d72f7572c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 15:41:43 +0000 Subject: [PATCH 30/71] [DOP-15511] Update XML package to 0.18.0 --- docs/changelog/next_release/259.feature.rst | 1 + onetl/file/format/xml.py | 10 +++++----- .../test_format_unit/test_xml_unit.py | 19 ++++++++++--------- 3 files changed, 16 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/next_release/259.feature.rst diff --git a/docs/changelog/next_release/259.feature.rst b/docs/changelog/next_release/259.feature.rst new file mode 100644 index 000000000..8934a32ec --- /dev/null +++ b/docs/changelog/next_release/259.feature.rst @@ -0,0 +1 @@ +Update ``XML`` package from 0.17.0 to 0.18.0. 
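To illustrate the effect of the version bump below, here is a minimal sketch of pulling the updated package into a Spark session. It assumes Spark 3.5.0 with the default Scala 2.12 (so ``XML.get_packages`` is expected to resolve to ``com.databricks:spark-xml_2.12:0.18.0`` after this change); the application name is purely illustrative.

.. code-block:: python

    from pyspark.sql import SparkSession

    from onetl.file.format import XML

    # with the new default package_version, this is expected to return
    # ["com.databricks:spark-xml_2.12:0.18.0"]
    packages = XML.get_packages(spark_version="3.5.0")

    spark = (
        SparkSession.builder.appName("spark-xml-example")  # hypothetical app name
        .config("spark.jars.packages", ",".join(packages))
        .getOrCreate()
    )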
diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 2c3a92cdf..678d99e37 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -159,7 +159,7 @@ def get_packages( # noqa: WPS231 If ``None``, ``spark_version`` is used to determine Scala version. package_version : str, optional - Package version in format ``major.minor.patch``. Default is ``0.17.0``. + Package version in format ``major.minor.patch``. Default is ``0.18.0``. See `Maven index `_ for list of available versions. @@ -185,7 +185,7 @@ def get_packages( # noqa: WPS231 XML.get_packages( spark_version="3.5.0", scala_version="2.12", - package_version="0.17.0", + package_version="0.18.0", ) """ @@ -196,7 +196,7 @@ def get_packages( # noqa: WPS231 raise ValueError(f"Package version must be above 0.13, got {version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version("0.17.0").min_digits(3) + version = Version("0.18.0").min_digits(3) spark_ver = Version(spark_version) scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) @@ -205,8 +205,8 @@ def get_packages( # noqa: WPS231 if spark_ver < Version("3.0"): raise ValueError(f"Spark version must be 3.x, got {spark_ver}") - if scala_ver < Version("2.12") or scala_ver > Version("2.13"): - raise ValueError(f"Scala version must be 2.12 or 2.13, got {scala_ver.format('{0}.{1}')}") + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version must be at least 2.12, got {scala_ver.format('{0}.{1}')}") return [f"com.databricks:spark-xml_{scala_ver.format('{0}.{1}')}:{version}"] diff --git a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py index d2b6c69d3..b7ee748da 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py @@ -10,18 +10,19 @@ @pytest.mark.parametrize( "spark_version, scala_version, package_version, expected_packages", [ - ("3.2.4", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.2.4", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.4.1", "2.12", "0.18.0", ["com.databricks:spark-xml_2.12:0.18.0"]), - ("3.0.0", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), - ("3.0.0", "2.12", "0.17.0", ["com.databricks:spark-xml_2.12:0.17.0"]), - ("3.1.2", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.0.0", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), + ("3.0.0", "2.12", "0.18.0", ["com.databricks:spark-xml_2.12:0.18.0"]), + ("3.1.2", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.1.2", "2.12", "0.16.0", ["com.databricks:spark-xml_2.12:0.16.0"]), - ("3.2.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.2.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.2.0", "2.12", "0.15.0", ["com.databricks:spark-xml_2.12:0.15.0"]), - ("3.2.4", "2.13", None, ["com.databricks:spark-xml_2.13:0.17.0"]), + ("3.2.4", "2.13", None, ["com.databricks:spark-xml_2.13:0.18.0"]), ("3.4.1", "2.13", "0.18.0", ["com.databricks:spark-xml_2.13:0.18.0"]), + ("3.4.1", "3.0", "0.18.0", ["com.databricks:spark-xml_3.0:0.18.0"]), ("3.3.0", None, "0.16.0", ["com.databricks:spark-xml_2.12:0.16.0"]), - ("3.3.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.3.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.2.4", "2.12.1", "0.15.0", ["com.databricks:spark-xml_2.12:0.15.0"]), ], ) @@ 
-54,11 +55,11 @@ def test_xml_get_packages_restriction_for_spark_2x(spark_version, scala_version, "spark_version, scala_version, package_version", [ ("3.2.4", "2.11", None), - ("3.4.1", "2.14", None), + ("3.4.1", "2.10", None), ], ) def test_xml_get_packages_scala_version_error(spark_version, scala_version, package_version): - with pytest.raises(ValueError, match=r"Scala version must be 2.12 or 2.13, got \d+\.\d+"): + with pytest.raises(ValueError, match=f"Scala version must be at least 2.12, got {scala_version}"): XML.get_packages( spark_version=spark_version, scala_version=scala_version, From 43135acf8e4cd60a55fbb51f3cf0eaf253086188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 16:42:09 +0000 Subject: [PATCH 31/71] [DOP-15512] Download only packages required by tests --- .github/workflows/test-greenplum.yml | 1 - CONTRIBUTING.rst | 1 - onetl/_util/version.py | 13 ++-- .../db_connection/greenplum/connection.py | 6 +- .../db_connection/kafka/connection.py | 4 +- .../file_df_connection/spark_s3/connection.py | 4 +- onetl/file/format/avro.py | 4 +- onetl/file/format/excel.py | 6 +- onetl/file/format/xml.py | 6 +- pytest.ini | 7 ++ tests/fixtures/spark.py | 70 ++++++++++++------- .../test_avro_integration.py | 2 +- .../test_csv_integration.py | 2 +- .../test_excel_integration.py | 2 +- .../test_json_integration.py | 2 +- .../test_jsonline_integration.py | 2 +- .../test_orc_integration.py | 2 +- .../test_parquet_integration.py | 2 +- .../test_xml_integration.py | 2 +- 19 files changed, 79 insertions(+), 59 deletions(-) diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index bf5e5012e..440ab0b6c 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -113,7 +113,6 @@ jobs: export ONETL_GP_PACKAGE_VERSION=${{ inputs.package-version }} ./pytest_runner.sh -m greenplum env: - ONETL_DB_WITH_GREENPLUM: 'true' GREENPLUM_PACKAGES_USER: ${{ secrets.GREENPLUM_PACKAGES_USER }} GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index e347a6ecf..3d132ddb5 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -176,7 +176,6 @@ Without docker-compose * Download `VMware Greenplum connector for Spark `_ * Either move it to ``~/.ivy2/jars/``, or pass file path to ``CLASSPATH`` - * Set environment variable ``ONETL_DB_WITH_GREENPLUM=true`` to enable adding connector to Spark session Start all containers with dependencies: diff --git a/onetl/_util/version.py b/onetl/_util/version.py index 24b924da2..85bde1c7a 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -25,15 +25,10 @@ class Version: """ - def __init__(self, version: Version | str): - if isinstance(version, Version): - self._raw_str: str = version._raw_str - self._raw_parts: list[str] = version._raw_parts.copy() - self._numeric_parts: list[int] = version._numeric_parts.copy() - else: - self._raw_str = version - self._raw_parts = re.split("[.-]", version) - self._numeric_parts = [int(part) for part in self._raw_parts if part.isdigit()] + def __init__(self, version: str): + self._raw_str = version + self._raw_parts = re.split("[.-]", version) + self._numeric_parts = [int(part) for part in self._raw_parts if part.isdigit()] @property def major(self) -> int: diff --git 
a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 6744816f8..c3e6b3b9f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -164,9 +164,9 @@ class Greenplum(JDBCMixin, DBConnection): def get_packages( cls, *, - scala_version: str | Version | None = None, - spark_version: str | Version | None = None, - package_version: str | Version | None = None, + scala_version: str | None = None, + spark_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index ddc6637f5..dd834495b 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -392,8 +392,8 @@ def get_df_schema( @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 44b1e2355..12babf694 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -232,8 +232,8 @@ class SparkS3(SparkFileDFConnection): @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 94fad71dd..4242a3873 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -117,8 +117,8 @@ class Config: @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index ffbaf9173..a3ea7a015 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -114,9 +114,9 @@ class Config: @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, - package_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 678d99e37..83c02329b 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -141,9 +141,9 @@ class Config: @classmethod def get_packages( # noqa: WPS231 cls, - spark_version: str | Version, - scala_version: str | Version | None = None, - package_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. 
|support_hooks| diff --git a/pytest.ini b/pytest.ini index 3c71e8eb6..a1143d118 100644 --- a/pytest.ini +++ b/pytest.ini @@ -27,3 +27,10 @@ markers = samba: Samba tests teradata: Teradata tests webdav: WebDAV tests + csv: CSV tests + json: JSON tests + orc: ORC tests + parquet: Parquet tests + xml: XML tests + avro: Avro tests + excel: Excel tests diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 241f87fc3..7483c3ccf 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -32,7 +32,7 @@ def ivysettings_path(): @pytest.fixture(scope="session") -def maven_packages(): +def maven_packages(request): from onetl.connection import ( MSSQL, Clickhouse, @@ -48,43 +48,63 @@ def maven_packages(): from onetl.file.format import XML, Avro, Excel pyspark_version = get_pyspark_version() - packages = ( - Clickhouse.get_packages() - + MSSQL.get_packages() - + MySQL.get_packages() - + Oracle.get_packages() - + Postgres.get_packages() - + Teradata.get_packages() - ) + packages: list[str] = [] + + # get markers from all downstream tests + markers = set() + for func in request.session.items: + markers.update(marker.name for marker in func.iter_markers()) + + if "clickhouse" in markers: + packages.extend(Clickhouse.get_packages()) + + if "mssql" in markers: + packages.extend(MSSQL.get_packages()) + + if "mysql" in markers: + packages.extend(MySQL.get_packages()) + + if "oracle" in markers: + packages.extend(Oracle.get_packages()) + + if "postgres" in markers: + packages.extend(Postgres.get_packages()) + + if "teradata" in markers: + packages.extend(Teradata.get_packages()) - with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" - if with_greenplum: - # Greenplum connector jar is not publicly available, + if "greenplum" in markers: packages.extend( Greenplum.get_packages( - spark_version=pyspark_version, + spark_version=str(pyspark_version), package_version=os.getenv("ONETL_GP_PACKAGE_VERSION") or None, ), ) if pyspark_version >= Version("2.4"): - # There is no Avro package for Spark 2.3 - packages.extend(Avro.get_packages(spark_version=pyspark_version)) - # Kafka connector for Spark 2.3 is too old and not supported - packages.extend(Kafka.get_packages(spark_version=pyspark_version)) + if "avro" in markers: + # There is no Avro package for Spark 2.3 + packages.extend(Avro.get_packages(spark_version=str(pyspark_version))) + if "kafka" in markers: + # Kafka connector for Spark 2.3 is too old and not supported + packages.extend(Kafka.get_packages(spark_version=str(pyspark_version))) if pyspark_version >= Version("3.2"): - # There is no SparkS3 connector for Spark less than 3 - packages.extend(SparkS3.get_packages(spark_version=pyspark_version)) + if "s3" in markers: + # There is no SparkS3 connector for Spark less than 3 + packages.extend(SparkS3.get_packages(spark_version=str(pyspark_version))) - # There is no XML files support for Spark less than 3 - packages.extend(XML.get_packages(spark_version=pyspark_version)) + if "xml" in markers: + # There is no XML files support for Spark less than 3 + packages.extend(XML.get_packages(spark_version=str(pyspark_version))) - # There is no MongoDB connector for Spark less than 3.2 - packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) + if "mongodb" in markers: + # There is no MongoDB connector for Spark less than 3.2 + packages.extend(MongoDB.get_packages(spark_version=str(pyspark_version))) - # There is no Excel files support for Spark less than 3.2 - 
packages.extend(Excel.get_packages(spark_version=pyspark_version)) + if "excel" in markers: + # There is no Excel files support for Spark less than 3.2 + packages.extend(Excel.get_packages(spark_version=str(pyspark_version))) return packages diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index d9ada7bda..594de40ba 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -16,7 +16,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.avro] @pytest.fixture() diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index 7d0983a9b..bae6f1d4c 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -24,7 +24,7 @@ except ImportError: pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.csv] def test_csv_reader_with_infer_schema( diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py index 8344de1db..890d9a9b6 100644 --- a/tests/tests_integration/test_file_format_integration/test_excel_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -19,7 +19,7 @@ except ImportError: pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.excel] @pytest.mark.parametrize("format", ["xlsx", "xls"]) diff --git a/tests/tests_integration/test_file_format_integration/test_json_integration.py b/tests/tests_integration/test_file_format_integration/test_json_integration.py index 71bc4e5cc..dcdbbc039 100644 --- a/tests/tests_integration/test_file_format_integration/test_json_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_json_integration.py @@ -30,7 +30,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.json] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py b/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py index f4678e17d..a246843e4 100644 --- a/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py @@ -14,7 +14,7 @@ except ImportError: pytest.skip("Missing pandas", 
allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.json] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_orc_integration.py b/tests/tests_integration/test_file_format_integration/test_orc_integration.py index a848f0f25..40902cef9 100644 --- a/tests/tests_integration/test_file_format_integration/test_orc_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_orc_integration.py @@ -14,7 +14,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.orc] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_parquet_integration.py b/tests/tests_integration/test_file_format_integration/test_parquet_integration.py index 41d492c43..ea5844eb1 100644 --- a/tests/tests_integration/test_file_format_integration/test_parquet_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_parquet_integration.py @@ -14,7 +14,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.parquet] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_xml_integration.py b/tests/tests_integration/test_file_format_integration/test_xml_integration.py index 81aade061..705d7ff84 100644 --- a/tests/tests_integration/test_file_format_integration/test_xml_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_xml_integration.py @@ -15,7 +15,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.xml] @pytest.fixture() From 9ff5385506436f7cd65e940410efd8457444b9dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 18:41:33 +0000 Subject: [PATCH 32/71] [DOP-15512] Reduce size of Github CI caches --- .github/workflows/test-clickhouse.yml | 6 +++--- .github/workflows/test-greenplum.yml | 6 +++--- .github/workflows/test-hdfs.yml | 6 +++--- .github/workflows/test-hive.yml | 6 +++--- .github/workflows/test-kafka.yml | 6 +++--- .github/workflows/test-local-fs.yml | 6 +++--- .github/workflows/test-mongodb.yml | 6 +++--- .github/workflows/test-mssql.yml | 6 +++--- .github/workflows/test-mysql.yml | 6 +++--- .github/workflows/test-oracle.yml | 6 +++--- .github/workflows/test-postgres.yml | 6 +++--- .github/workflows/test-s3.yml | 6 +++--- .github/workflows/test-teradata.yml | 7 ++++--- 13 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 4a29f84cb..6c790cbc5 100644 --- a/.github/workflows/test-clickhouse.yml +++ 
b/.github/workflows/test-clickhouse.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 440ab0b6c..d54e96970 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -76,10 +76,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: 
Set up Postgres client if: runner.os == 'Linux' diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index e97ee6249..6b4bf7849 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -66,10 +66,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 446cb76d0..893348ab6 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ 
inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index d1389049f..120ac3a40 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -104,10 +104,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index 529deef41..f4b37c45a 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 
'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index c842ce7af..ea230132f 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -69,10 +69,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index efc8edce8..23d315a93 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -72,10 +72,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql- 
+ ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index c98562dd4..66bda2e10 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index c22bc9553..2438fce1d 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -77,10 +77,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 
'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Set up Oracle instantclient if: runner.os == 'Linux' diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 7ac821bc1..87fd34731 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -70,10 +70,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index a75297fe3..96775f3bf 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version 
}}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index 214859e2b..b348da5f2 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -54,12 +54,13 @@ jobs: - name: Cache pip uses: actions/cache@v4 + if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel From bd4197970310287608d82e5cf9340fe8f991f283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 18:55:02 +0000 Subject: [PATCH 33/71] [DOP-15512] Reduce size of Github CI caches --- .github/workflows/codeql-analysis.yml | 2 -- .github/workflows/test-core.yml | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index f3397b706..a1e1e7ef5 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -49,8 +49,6 @@ jobs: restore-keys: | ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }} ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql- - ${{ runner.os }}-python - ${{ 
runner.os }}- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index cdd977398..b7a1b3cec 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel From eb075ba7a8af1cb35c431523292b5661478ec50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 19:06:37 +0000 Subject: [PATCH 34/71] [DOP-15512] Speed up HDFS tests a bit --- .github/workflows/test-hdfs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index 6b4bf7849..e775ed36e 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -70,6 +70,8 @@ jobs: restore-keys: | ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- + # add krb5 to cache, but not in feature/bugfix branches + save-always: ${{ github.ref_name == 'develop' }} - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel From bcd9bcdc8bf2a3a2647f2b203e0797a2538c1e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 19:22:38 +0000 Subject: [PATCH 35/71] [DOP-15512] Speed up HDFS tests a bit --- .github/workflows/test-hdfs.yml | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git 
a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index e775ed36e..51b90c017 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -51,27 +51,15 @@ jobs: sudo apt-get update sudo apt-get install --no-install-recommends libkrb5-dev gcc - - name: Cache Ivy - uses: actions/cache@v4 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs- - - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - # add krb5 to cache, but not in feature/bugfix branches - save-always: ${{ github.ref_name == 'develop' }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel From 5fde37a74d7193d1e8cc740f5e7951639ca6ce33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 24 Apr 2024 19:26:34 +0000 Subject: [PATCH 36/71] [DOP-15512] Speed up HDFS tests a bit --- .github/workflows/test-hdfs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index 51b90c017..918e4f091 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -59,7 +59,9 @@ jobs: key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | 
${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel From 8b5202ac53a3556d7f85ca4fb7add2f32ff42256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 26 Apr 2024 07:56:33 +0000 Subject: [PATCH 37/71] [DOP-13855] Update Clickhouse package to 0.6.0-patch4 --- docs/changelog/next_release/249.breaking.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 8 ++++---- tests/fixtures/spark.py | 2 +- .../tests_db_connection_unit/test_clickhouse_unit.py | 9 ++++++--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst index 04764a1e6..ca733c81f 100644 --- a/docs/changelog/next_release/249.breaking.rst +++ b/docs/changelog/next_release/249.breaking.rst @@ -1 +1 @@ -Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0 `_. +Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0-patch4 `_. diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 288612f3c..2f22de9bf 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -28,7 +28,7 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0 `_ + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch4 `_ (`official Clickhouse JDBC driver `_). .. warning:: @@ -119,7 +119,7 @@ def get_packages( Parameters ---------- package_version : str, optional - ClickHouse JDBC version client packages. Defaults to ``0.6.0``. + ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch4``. apache_http_client_version : str, optional Apache HTTP Client version package. Defaults to ``5.3.1``. @@ -139,7 +139,7 @@ def get_packages( ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. 
""" - default_jdbc_version = "0.6.0" + default_jdbc_version = "0.6.0-patch4" default_http_version = "5.3.1" jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) @@ -158,7 +158,7 @@ def get_packages( @classproperty def package(self) -> str: """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" - return "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" + return "com.clickhouse:clickhouse-jdbc:0.6.0-patch4,com.clickhouse:clickhouse-http-client:0.6.0-patch4,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 7483c3ccf..dbc03ba33 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -48,13 +48,13 @@ def maven_packages(request): from onetl.file.format import XML, Avro, Excel pyspark_version = get_pyspark_version() - packages: list[str] = [] # get markers from all downstream tests markers = set() for func in request.session.items: markers.update(marker.name for marker in func.iter_markers()) + packages: list[str] = [] if "clickhouse" in markers: packages.extend(Clickhouse.get_packages()) diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 6b400b93e..79fc13ddc 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -10,7 +10,10 @@ def test_clickhouse_driver(): def test_clickhouse_package(): - expected_packages = "com.clickhouse:clickhouse-jdbc:0.6.0,com.clickhouse:clickhouse-http-client:0.6.0,org.apache.httpcomponents.client5:httpclient5:5.3.1" + expected_packages = ( + "com.clickhouse:clickhouse-jdbc:0.6.0-patch4,com.clickhouse:clickhouse-http-client:0.6.0-patch4," + "org.apache.httpcomponents.client5:httpclient5:5.3.1" + ) assert Clickhouse.package == expected_packages @@ -21,8 +24,8 @@ def test_clickhouse_package(): None, None, [ - "com.clickhouse:clickhouse-jdbc:0.6.0", - "com.clickhouse:clickhouse-http-client:0.6.0", + "com.clickhouse:clickhouse-jdbc:0.6.0-patch4", + "com.clickhouse:clickhouse-http-client:0.6.0-patch4", "org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), From 96ce94022ded0a16657a5c8b8cea44a45b3b6817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 26 Apr 2024 08:19:40 +0000 Subject: [PATCH 38/71] [DOP-13853] Update MongoDB package to 10.2.3 --- docs/changelog/next_release/255.feature.rst | 2 +- .../connection/db_connection/mongodb/types.rst | 4 ++-- .../db_connection/mongodb/connection.py | 16 ++++++++-------- .../test_mongodb_unit.py | 18 +++++++++--------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/changelog/next_release/255.feature.rst b/docs/changelog/next_release/255.feature.rst index 55d780196..aff64c57c 100644 --- a/docs/changelog/next_release/255.feature.rst +++ b/docs/changelog/next_release/255.feature.rst @@ -1 +1 @@ -:class:`MongoDB` connection now uses MongoDB Spark connector ``10.2.2``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. 
+:class:`MongoDB` connection now uses MongoDB Spark connector ``10.2.3``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index bd5978aa9..f9787ff2e 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -73,8 +73,8 @@ References Here you can find source code with type conversions: -* `MongoDB -> Spark `_ -* `Spark -> MongoDB `_ +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ Supported types --------------- diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 5d4299f39..f2c959295 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -50,8 +50,8 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. |support_hooks| - Based on package ``org.mongodb.spark:mongo-spark-connector:10.1.1`` - (`MongoDB connector for Spark `_) + Based on package `org.mongodb.spark:mongo-spark-connector:10.2.3 `_ + (`MongoDB connector for Spark `_) .. warning:: @@ -149,7 +149,7 @@ def get_packages( Spark version in format ``major.minor``. Used only if ``scala_version=None``. package_version : str, optional - Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.2.2``. + Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.2.3``. Examples -------- @@ -160,10 +160,10 @@ def get_packages( MongoDB.get_packages(scala_version="2.12") # specify custom connector version - MongoDB.get_packages(scala_version="2.12", package_version="10.2.2") + MongoDB.get_packages(scala_version="2.12", package_version="10.2.3") """ - default_package_version = "10.2.2" + default_package_version = "10.2.3" if scala_version: scala_ver = Version(scala_version).min_digits(2) @@ -190,7 +190,7 @@ def package_spark_3_2(cls) -> str: "use `MongoDB.get_packages(spark_version='3.2')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" @classproperty def package_spark_3_3(cls) -> str: @@ -200,7 +200,7 @@ def package_spark_3_3(cls) -> str: "use `MongoDB.get_packages(spark_version='3.3')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" @classproperty def package_spark_3_4(cls) -> str: @@ -210,7 +210,7 @@ def package_spark_3_4(cls) -> str: "use `MongoDB.get_packages(spark_version='3.4')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" @slot def pipeline( diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 5333617b1..3e5f85215 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -12,9 +12,9 @@ def test_mongodb_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MongoDB.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert MongoDB.package_spark_3_2 == 
"org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" - assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" - assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2" + assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" def test_mongodb_get_packages_no_input(): @@ -50,16 +50,16 @@ def test_mongodb_get_packages_scala_version_not_supported(scala_version): @pytest.mark.parametrize( "spark_version, scala_version, package_version, package", [ - (None, "2.12", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), - (None, "2.13", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.13:10.2.2"), - ("3.2", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), - ("3.3", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), - ("3.4", None, "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + (None, "2.12", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), + (None, "2.13", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.13:10.2.3"), + ("3.2", None, "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), + ("3.3", None, "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), + ("3.4", None, "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), ("3.2", "2.12", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), ("3.4", "2.13", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), ("3.2", "2.12", "10.2.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"), ("3.2", "2.12", "10.2.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0"), - ("3.2.4", "2.12.1", "10.2.2", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.2"), + ("3.2.4", "2.12.1", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), ], ) def test_mongodb_get_packages(spark_version, scala_version, package_version, package): From 6fc59911d1fd64eecbde4911c9314c242ef3ee18 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Sat, 27 Apr 2024 13:36:38 +0300 Subject: [PATCH 39/71] [DOP-13845] - implement Avro.parse_column, Avro.serialize_column (#265) * [DOP-13845] - implement Avro.parse_column, Avro.serialize_column * [DOP-13845] - add Avro format handling documentation * [DOP-13845] - add _get_schema_json * [DOP-13845] - add avro tests with responses --- docs/changelog/next_release/265.feature.rst | 1 + .../db_connection/kafka/format_handling.rst | 90 +++++++++ docs/file_df/file_formats/avro.rst | 2 +- onetl/file/format/avro.py | 179 +++++++++++++++++- requirements/tests/base.txt | 2 + .../test_avro_integration.py | 131 ++++++++++++- 6 files changed, 398 insertions(+), 7 deletions(-) create mode 100644 docs/changelog/next_release/265.feature.rst diff --git a/docs/changelog/next_release/265.feature.rst b/docs/changelog/next_release/265.feature.rst new file mode 100644 index 000000000..03c39a942 --- /dev/null +++ b/docs/changelog/next_release/265.feature.rst @@ -0,0 +1 @@ +Add ``Avro.parse_column`` and ``Avro.serialize_column`` methods to enhance the handling of Avro binary data within Spark. 
These methods allow for direct parsing of binary Avro data into structured Spark DataFrame columns and serialization of Spark DataFrame columns back into Avro binary format. diff --git a/docs/connection/db_connection/kafka/format_handling.rst b/docs/connection/db_connection/kafka/format_handling.rst index 74307c3f6..bc1993a20 100644 --- a/docs/connection/db_connection/kafka/format_handling.rst +++ b/docs/connection/db_connection/kafka/format_handling.rst @@ -152,3 +152,93 @@ For serializing data into JSON format and sending it back to Kafka, use the :obj # |{"name":"Alice","age":20}| # |{"name":"Bob","age":25} | # +-------------------------+ + +Avro Format Handling +-------------------- + +``DBReader`` +~~~~~~~~~~~~ + +To process Avro formatted data from Kafka, use the :obj:`Avro.parse_column ` method. This method allows you to convert a column containing Avro binary data directly into a structured Spark DataFrame using a predefined schema. + +.. code-block:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import StructType, StructField, IntegerType, StringType + + from onetl.db import DBReader + from onetl.file.format import Avro + from onetl.connection import Kafka + + spark = SparkSession.builder.appName("KafkaAvroExample").getOrCreate() + + kafka = Kafka(...) + avro = Avro( + schema_dict={ + "type": "record", + "name": "Person", + "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}], + } + ) + + reader = DBReader( + connection=kafka, + topic="topic_name", + ) + df = reader.run() + + df.show() + # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ + # |key |value |topic |partition|offset|timestamp |timestampType| + # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ + # |[31]|[02 02 02 08 76 6... (binary data)] |topicAvro |0 |0 |2024-04-24 13:02:25.911|0 | + # |[32]|[02 04 02 08 76 6... (binary data)] |topicAvro |0 |1 |2024-04-24 13:02:25.922|0 | + # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ + + parsed_df = df.select(avro.parse_column("value")) + parsed_df.show() + # +-----+----+ + # | name| age| + # +-----+----+ + # |Alice| 20| + # | Bob| 25| + # +-----+----+ + +``DBWriter`` +~~~~~~~~~~~~ + +To serialize structured data into Avro format and write it back to a Kafka topic, use the :obj:`Avro.serialize_column ` method. + +.. code-block:: python + + from onetl.db import DBWriter + from onetl.file.format import Avro + from onetl.connection import Kafka + + kafka = Kafka(...) + avro = Avro( + schema_dict={ + "type": "record", + "name": "Person", + "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}], + } + ) + + df.select("value").show() + # +-----------+ + # |value | + # +-----------+ + # |{Alice, 20}| + # |{Bob, 25} | + # +-----------+ + + # serializing data into Avro format + serialized_df = df.select(avro.serialize_column("value")) + + serialized_df.show() + # +---+------------------------------------+ + # |key|value | + # +---+------------------------------------+ + # | 1|[02 02 02 08 76 6... (binary data)] | + # | 2|[02 04 02 08 76 6... (binary data)] | + # +---+------------------------------------+ diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst index 7f1ec0d4f..1fe9ef781 100644 --- a/docs/file_df/file_formats/avro.rst +++ b/docs/file_df/file_formats/avro.rst @@ -6,4 +6,4 @@ Avro .. 
currentmodule:: onetl.file.format.avro .. autoclass:: Avro - :members: get_packages + :members: get_packages, parse_column, serialize_column diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 4242a3873..e07c66f51 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -20,7 +20,8 @@ from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import DataFrameReader, DataFrameWriter, SparkSession + from pyspark.sql import Column, DataFrameReader, DataFrameWriter, SparkSession + PROHIBITED_OPTIONS = frozenset( ( @@ -189,8 +190,184 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter: options["avroSchema"] = json.dumps(self.schema_dict) return writer.format(self.name).options(**options) + def parse_column(self, column: str | Column) -> Column: + """ + Parses an Avro binary column into a structured Spark SQL column using Spark's + `from_avro `_ function, based on the schema provided within the class. + + .. note:: + + Can be used only with Spark 3.x+ + + .. warning:: + + If ``schema_url`` is provided, ``requests`` library is used to fetch the schema from the URL. It should be installed manually, like this: + + .. code:: bash + + pip install requests + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing Avro binary data to parse. + + Returns + ------- + Column + A new Column object with data parsed from Avro binary to the specified structured format. + + Raises + ------ + ValueError + If the Spark version is less than 3.x or if neither schema_dict nor schema_url is defined. + ImportError + If ``schema_url`` is used and the ``requests`` library is not installed. + + + Examples + -------- + .. code:: python + + from pyspark.sql import SparkSession + + from onetl.file.format import Avro + + spark = SparkSession.builder.appName("AvroParsingExample").getOrCreate() + schema_dict = { + "type": "record", + "name": "Person", + "fields": [{"name": "name", "type": "string"}], + } + avro = Avro(schema_dict=schema_dict) + df = spark.createDataFrame([("bytes_data_here",)], ["avro_data"]) + + parsed_df = df.select(avro.parse_column("avro_data")) + parsed_df.show() + + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + + from pyspark.sql.avro.functions import from_avro + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column).cast("binary") + + schema = self._get_schema_json() + if not schema: + raise ValueError("Avro.parse_column can be used only with defined `schema_dict` or `schema_url`") + + return from_avro(column, schema).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into an Avro binary column using Spark's + `to_avro `_ function. + + .. note:: + + Can be used only with Spark 3.x+ + + .. warning:: + + If ``schema_url`` is provided, ``requests`` library is used to fetch the schema from the URL. It should be installed manually, like this: + + .. code:: bash + + pip install requests + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing the data to serialize to Avro format. 
+ + Returns + ------- + Column + A new Column object with data serialized from Spark SQL structures to Avro binary. + + Raises + ------ + ValueError + If the Spark version is less than 3.x. + ImportError + If ``schema_url`` is used and the ``requests`` library is not installed. + + Examples + -------- + .. code:: python + + from pyspark.sql import SparkSession + + from onetl.file.format import Avro + + spark = SparkSession.builder.appName("AvroSerializationExample").getOrCreate() + schema_dict = { + "type": "record", + "name": "Person", + "fields": [{"name": "id", "type": "long"}, {"name": "name", "type": "string"}], + } + + avro = Avro(schema_dict=schema_dict) + df = spark.createDataFrame([(1, "John Doe"), (2, "Jane Doe")], ["id", "name"]) + + serialized_df = df.select(avro.serialize_column("name")) + serialized_df.show() + + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + + from pyspark.sql.avro.functions import to_avro + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + schema = self._get_schema_json() + return to_avro(column, schema).alias(column_name) + @validator("schema_dict", pre=True) def _parse_schema_from_json(cls, value): if isinstance(value, (str, bytes)): return json.loads(value) return value + + def _check_spark_version_for_serialization(self, spark: SparkSession): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + class_name = self.__class__.__name__ + error_msg = ( + f"`{class_name}.parse_column` or `{class_name}.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}." + ) + raise ValueError(error_msg) + + def _get_schema_json(self) -> str: + if self.schema_dict: + return json.dumps(self.schema_dict) + elif self.schema_url: + try: + import requests + + response = requests.get(self.schema_url) # noqa: S113 + return response.text + except ImportError as e: + raise ImportError( + "The 'requests' library is required to use 'schema_url' but is not installed. " + "Install it with 'pip install requests' or avoid using 'schema_url'.", + ) from e + else: + return "" diff --git a/requirements/tests/base.txt b/requirements/tests/base.txt index 17e7ef712..b702fca16 100644 --- a/requirements/tests/base.txt +++ b/requirements/tests/base.txt @@ -4,3 +4,5 @@ pytest<8 pytest-lazy-fixture pytest-mock pytest-rerunfailures +requests +responses diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index 594de40ba..dad5a730c 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -4,7 +4,10 @@ Do not test all the possible options and combinations, we are not testing Spark here. 
""" +import contextlib + import pytest +import responses from onetl._util.spark import get_spark_version from onetl._util.version import Version @@ -12,9 +15,11 @@ from onetl.file.format import Avro try: + from pyspark.sql.functions import col + from tests.util.assert_df import assert_equal_df except ImportError: - pytest.skip("Missing pandas", allow_module_level=True) + pytest.skip("Missing pandas or pyspark", allow_module_level=True) pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.avro] @@ -30,8 +35,11 @@ def avro_schema(): {"name": "id", "type": ["null", "int"]}, {"name": "str_value", "type": ["null", "string"]}, {"name": "int_value", "type": ["null", "int"]}, - {"name": "date_value", "type": ["null", "int"]}, - {"name": "datetime_value", "type": ["null", "long"]}, + {"name": "date_value", "type": ["null", {"type": "int", "logicalType": "date"}]}, + { + "name": "datetime_value", + "type": ["null", {"type": "long", "logicalType": "timestamp-millis"}], + }, {"name": "float_value", "type": ["null", "double"]}, ], } @@ -56,7 +64,7 @@ def test_avro_reader( """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) if spark_version < Version("2.4"): - pytest.skip("Avro files are supported on Spark 3.2+ only") + pytest.skip("Avro files are supported on Spark 2.4+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe @@ -93,7 +101,7 @@ def test_avro_writer( """Written files can be read by Spark""" spark_version = get_spark_version(spark) if spark_version < Version("2.4"): - pytest.skip("Avro files are supported on Spark 3.2+ only") + pytest.skip("Avro files are supported on Spark 2.4+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path df = file_df_dataframe @@ -117,3 +125,116 @@ def test_avro_writer( assert read_df.count() assert read_df.schema == df.schema assert_equal_df(read_df, df, order_by="id") + + +@pytest.mark.parametrize("column_type", [str, col]) +def test_avro_serialize_and_parse_column( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, + avro_schema, + column_type, +): + from pyspark.sql.functions import struct + from pyspark.sql.types import BinaryType + + spark_version = get_spark_version(spark) + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") + + if spark_version.major < 3: + msg = ( + f"`Avro.parse_column` or `Avro.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=msg) + else: + context_manager = contextlib.nullcontext() + df = file_df_dataframe + avro = Avro(schema_dict=avro_schema) + + combined_df = df.withColumn("combined", struct([col(c) for c in df.columns])) + + with context_manager: + serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) + assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) + parsed_df = serialized_df.select(avro.parse_column(column_type("combined"))) + assert combined_df.select("combined").collect() == parsed_df.collect() + + +@pytest.mark.parametrize("column_type", [str, col]) +def test_avro_serialize_and_parse_no_schema( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, + column_type, +): + from pyspark.sql.functions import struct + from pyspark.sql.types import BinaryType + + spark_version = get_spark_version(spark) + if 
spark_version < Version("2.4"):
+        pytest.skip("Avro files are supported on Spark 2.4+ only")
+
+    if spark_version.major < 3:
+        msg = (
+            f"`Avro.parse_column` or `Avro.serialize_column` are available "
+            f"only since Spark 3.x, but got {spark_version}"
+        )
+        context_manager = pytest.raises(ValueError, match=msg)
+    else:
+        context_manager = contextlib.nullcontext()
+
+    df = file_df_dataframe
+    avro = Avro()
+
+    with context_manager:
+        combined_df = df.withColumn("combined", struct([col(c) for c in df.columns]))
+        serialized_df = combined_df.select(avro.serialize_column(column_type("combined")))
+        assert isinstance(serialized_df.schema["combined"].dataType, BinaryType)
+
+    with pytest.raises(
+        ValueError,
+        match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`",
+    ):
+        serialized_df.select(avro.parse_column(column_type("combined")))
+
+
+@pytest.mark.parametrize("column_type", [str, col])
+@responses.activate
+def test_avro_serialize_and_parse_with_schema_url(
+    spark,
+    local_fs_file_df_connection_with_path,
+    file_df_dataframe,
+    column_type,
+    avro_schema,
+):
+    from pyspark.sql.functions import struct
+    from pyspark.sql.types import BinaryType
+
+    spark_version = get_spark_version(spark)
+    if spark_version < Version("2.4"):
+        pytest.skip("Avro files are supported on Spark 2.4+ only")
+
+    if spark_version.major < 3:
+        msg = (
+            f"`Avro.parse_column` or `Avro.serialize_column` are available "
+            f"only since Spark 3.x, but got {spark_version}"
+        )
+        context_manager = pytest.raises(ValueError, match=msg)
+    else:
+        context_manager = contextlib.nullcontext()
+
+    # mocking the request to return a JSON schema
+    schema_url = "http://example.com/avro_schema"
+    responses.add(responses.GET, schema_url, json=avro_schema, status=200)
+
+    df = file_df_dataframe
+    avro = Avro(schema_url=schema_url)
+
+    combined_df = df.withColumn("combined", struct([col(c) for c in df.columns]))
+    with context_manager:
+        serialized_df = combined_df.select(avro.serialize_column(column_type("combined")))
+        assert isinstance(serialized_df.schema["combined"].dataType, BinaryType)
+        parsed_df = serialized_df.select(avro.parse_column(column_type("combined")))
+        assert combined_df.select("combined").collect() == parsed_df.collect()
From 02e22fd779ebc644c67d5c472f5ffa417ae5c368 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?=
 =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?=
 =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?=
Date: Fri, 26 Apr 2024 08:19:40 +0000
Subject: [PATCH 40/71] [DOP-13853] Serialize DateTimeHWM to Clickhouse DateTime64(6)

---
 .github/workflows/data/clickhouse/matrix.yml |  6 ++---
 docs/changelog/next_release/267.breaking.rst | 26 +++++++++++++++++++
 .../clickhouse/prerequisites.rst             |  2 +-
 .../db_connection/clickhouse/dialect.py      |  8 +++---
 tests/fixtures/processing/clickhouse.py      |  2 +-
 .../test_strategy_increment_clickhouse.py    | 25 ++++++++++++++++++
 6 files changed, 61 insertions(+), 8 deletions(-)
 create mode 100644 docs/changelog/next_release/267.breaking.rst

diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml
index 9c8c558ba..15c6bffd5 100644
--- a/.github/workflows/data/clickhouse/matrix.yml
+++ b/.github/workflows/data/clickhouse/matrix.yml
@@ -25,16 +25,16 @@ matrix:
     clickhouse-version: 23.6.1-alpine
     <<: *max
   full:
-    # the lowest supported Clickhouse version by JDBC driver
+    # Clickhouse version with proper DateTime > DateTime64 comparison
     - clickhouse-image: 
yandex/clickhouse-server - clickhouse-version: '20.7' + clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server clickhouse-version: 23.6.1-alpine <<: *max nightly: - clickhouse-image: yandex/clickhouse-server - clickhouse-version: '20.7' + clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server clickhouse-version: latest-alpine diff --git a/docs/changelog/next_release/267.breaking.rst b/docs/changelog/next_release/267.breaking.rst new file mode 100644 index 000000000..5ce301393 --- /dev/null +++ b/docs/changelog/next_release/267.breaking.rst @@ -0,0 +1,26 @@ +Serialize DateTimeHWM to Clickhouse's ``DateTime64(6)`` (precision up to microseconds) instead of ``DateTime`` (precision up to seconds). + +For Clickhouse below 21.1 comparing column of type ``DateTime`` with a value of type ``DateTime64`` was not supported, returning an empty dataframe. +To avoid this, replace: + +.. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="hwm_column", # <-- + ), + ) + +with: + +.. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="CAST(hwm_column AS DateTime64)", # <-- + ), + ) diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index 654add047..f7ade0341 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* Clickhouse server versions: 20.7 or higher +* Clickhouse server versions: 21.1 or higher * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/onetl/connection/db_connection/clickhouse/dialect.py b/onetl/connection/db_connection/clickhouse/dialect.py index 187b2e787..2c03620d3 100644 --- a/onetl/connection/db_connection/clickhouse/dialect.py +++ b/onetl/connection/db_connection/clickhouse/dialect.py @@ -26,9 +26,11 @@ def get_min_value(self, value: Any) -> str: return f"minOrNull({result})" def _serialize_datetime(self, value: datetime) -> str: - result = value.strftime("%Y-%m-%d %H:%M:%S") - return f"CAST('{result}' AS DateTime)" + # this requires at least Clickhouse 21.1, see: + # https://github.com/ClickHouse/ClickHouse/issues/16655 + result = value.strftime("%Y-%m-%d %H:%M:%S.%f") + return f"toDateTime64('{result}', 6)" def _serialize_date(self, value: date) -> str: result = value.strftime("%Y-%m-%d") - return f"CAST('{result}' AS Date)" + return f"toDate('{result}')" diff --git a/tests/fixtures/processing/clickhouse.py b/tests/fixtures/processing/clickhouse.py index 2b3e4cec1..bf0b2f3e7 100644 --- a/tests/fixtures/processing/clickhouse.py +++ b/tests/fixtures/processing/clickhouse.py @@ -20,7 +20,7 @@ class ClickhouseProcessing(BaseProcessing): "text_string": "String", "hwm_int": "Int32", "hwm_date": "Date", - "hwm_datetime": "DateTime", + "hwm_datetime": "DateTime64(6)", "float_value": "Float32", } diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 77ec071b3..67e67b065 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ 
b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -302,12 +302,37 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( ColumnDateHWM, lambda x: x.isoformat(), ), + pytest.param( + "hwm_date", + "CAST(text_string AS Date32)", + ColumnDateHWM, + lambda x: x.isoformat(), + marks=pytest.mark.xfail(reason="Date32 type was added in ClickHouse 21.9"), + ), ( "hwm_datetime", "CAST(text_string AS DateTime)", ColumnDateTimeHWM, lambda x: x.isoformat(), ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64)", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64(3))", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64(6))", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), ], ) def test_clickhouse_strategy_incremental_with_hwm_expr( From c8884c711844445e77a79d44732e3f83e6183f50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Sat, 27 Apr 2024 10:41:51 +0000 Subject: [PATCH 41/71] Remove Sphinx version limit --- requirements/docs.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index f86c2207b..3776dbb09 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -3,8 +3,7 @@ furo importlib-resources<6 numpydoc pygments-csv-lexer -# TODO: remove version limit after https://github.com/pradyunsg/furo/pull/783 -sphinx<7.3 +sphinx sphinx-copybutton sphinx-design sphinx-favicon From 88359433e83b416552c44625b589eaed1f22c806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Sat, 27 Apr 2024 10:48:24 +0000 Subject: [PATCH 42/71] Update pre-commit hooks --- .pre-commit-config.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a5aa7cb60..aa7bde988 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,8 +95,13 @@ repos: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] + - repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma + - repo: https://github.com/psf/black - rev: 24.4.0 + rev: 24.4.2 hooks: - id: black language_version: python3 @@ -106,7 +111,7 @@ repos: hooks: - id: blacken-docs additional_dependencies: - - black==24.3.0 + - black==24.4.2 - repo: https://github.com/pycqa/bandit rev: 1.7.8 From 4a2287cb18fe337e3850107f68929edcd1bd717c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Sat, 27 Apr 2024 10:52:44 +0000 Subject: [PATCH 43/71] Fix coverage report --- tests/.coveragerc | 1 + .../test_avro_integration.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/.coveragerc b/tests/.coveragerc index 5d7fb16f9..08633e6cc 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -17,6 +17,7 @@ exclude_lines = class .*\bProtocol\): @(abc\.)?abstractmethod if pyspark_version + if spark_version spark = 
SparkSession._instantiatedSession if log.isEnabledFor(logging.DEBUG): if sys.version_info diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index dad5a730c..0792d8567 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -130,7 +130,6 @@ def test_avro_writer( @pytest.mark.parametrize("column_type", [str, col]) def test_avro_serialize_and_parse_column( spark, - local_fs_file_df_connection_with_path, file_df_dataframe, avro_schema, column_type, @@ -150,6 +149,7 @@ def test_avro_serialize_and_parse_column( context_manager = pytest.raises(ValueError, match=msg) else: context_manager = contextlib.nullcontext() + df = file_df_dataframe avro = Avro(schema_dict=avro_schema) @@ -165,7 +165,6 @@ def test_avro_serialize_and_parse_column( @pytest.mark.parametrize("column_type", [str, col]) def test_avro_serialize_and_parse_no_schema( spark, - local_fs_file_df_connection_with_path, file_df_dataframe, column_type, ): @@ -193,18 +192,17 @@ def test_avro_serialize_and_parse_no_schema( serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) - with pytest.raises( - ValueError, - match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`", - ): - serialized_df.select(avro.parse_column(column_type("combined"))) + with pytest.raises( + ValueError, + match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`", + ): + serialized_df.select(avro.parse_column(column_type("combined"))) @pytest.mark.parametrize("column_type", [str, col]) @responses.activate def test_avro_serialize_and_parse_with_schema_url( spark, - local_fs_file_df_connection_with_path, file_df_dataframe, column_type, avro_schema, From c595e73c351bfd52be8f54d1ff32f9f88cfa7e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Sat, 27 Apr 2024 11:00:38 +0000 Subject: [PATCH 44/71] Fix coverage report --- .../test_avro_integration.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index 0792d8567..464727a2a 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -192,11 +192,11 @@ def test_avro_serialize_and_parse_no_schema( serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) - with pytest.raises( - ValueError, - match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`", - ): - serialized_df.select(avro.parse_column(column_type("combined"))) + with pytest.raises( + ValueError, + match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`", + ): + serialized_df.select(avro.parse_column(column_type("combined"))) @pytest.mark.parametrize("column_type", [str, col]) From b8e26635f4fd075e0dc2d1de5c6645b0101afa84 Mon Sep 17 00:00:00 2001 From: 
"dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 06:51:20 +0000 Subject: [PATCH 45/71] Bump actions/checkout from 3 to 4 in the github-actions group Bumps the github-actions group with 1 update: [actions/checkout](https://github.com/actions/checkout). Updates `actions/checkout` from 3 to 4 - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions ... Signed-off-by: dependabot[bot] --- .github/workflows/cache-cleanup.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml index 0f6b3fc19..b960a7492 100644 --- a/.github/workflows/cache-cleanup.yml +++ b/.github/workflows/cache-cleanup.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Cleanup cache run: | From 4f55fdc4c670d5779a03c285f07c655a7fd59748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 26 Apr 2024 11:07:27 +0000 Subject: [PATCH 46/71] [DOP-15547] Update Docker images to latest versions --- .env.dependencies | 7 - .github/workflows/data/clickhouse/matrix.yml | 6 +- .github/workflows/data/core/matrix.yml | 2 +- .github/workflows/data/core/tracked.txt | 2 + .github/workflows/data/ftp/matrix.yml | 2 +- .github/workflows/data/ftps/matrix.yml | 2 +- .github/workflows/data/greenplum/matrix.yml | 4 +- .github/workflows/data/hdfs/matrix.yml | 2 +- .github/workflows/data/hive/matrix.yml | 2 +- .github/workflows/data/kafka/matrix.yml | 10 +- .github/workflows/data/local-fs/matrix.yml | 1 + .github/workflows/data/mongodb/matrix.yml | 6 +- .github/workflows/data/mssql/matrix.yml | 2 +- .github/workflows/data/mysql/matrix.yml | 14 +- .github/workflows/data/oracle/matrix.yml | 6 +- .github/workflows/data/postgres/matrix.yml | 8 +- .github/workflows/data/s3/matrix.yml | 4 +- .github/workflows/data/samba/matrix.yml | 1 + .github/workflows/data/sftp/matrix.yml | 6 +- .github/workflows/data/teradata/matrix.yml | 2 +- .github/workflows/data/webdav/matrix.yml | 2 +- .github/workflows/test-clickhouse.yml | 4 - .github/workflows/test-ftp.yml | 15 +- .github/workflows/test-ftps.yml | 15 +- .github/workflows/test-hdfs.yml | 11 +- .github/workflows/test-kafka.yml | 12 -- .github/workflows/test-mongodb.yml | 4 - .github/workflows/test-mssql.yml | 4 - .github/workflows/test-mysql.yml | 4 - .github/workflows/test-oracle.yml | 4 - .github/workflows/test-postgres.yml | 4 - .github/workflows/test-s3.yml | 4 - .github/workflows/test-samba.yml | 7 - .github/workflows/test-sftp.yml | 4 - .github/workflows/test-webdav.yml | 7 - CONTRIBUTING.rst | 4 +- README.rst | 12 +- docker-compose.yml | 4 +- docker/Dockerfile | 5 +- docker/wait-for-it.sh | 182 ------------------ .../{spark-3.3.3.txt => spark-3.3.4.txt} | 2 +- .../{spark-3.4.2.txt => spark-3.4.3.txt} | 2 +- requirements/tests/spark-3.5.1.txt | 5 + .../test_clickhouse_reader_integration.py | 7 +- .../test_clickhouse_integration.py | 2 +- .../test_strategy_increment_clickhouse.py | 2 +- 46 files changed, 78 insertions(+), 339 
deletions(-) delete mode 100755 docker/wait-for-it.sh rename requirements/tests/{spark-3.3.3.txt => spark-3.3.4.txt} (80%) rename requirements/tests/{spark-3.4.2.txt => spark-3.4.3.txt} (76%) create mode 100644 requirements/tests/spark-3.5.1.txt diff --git a/.env.dependencies b/.env.dependencies index af892898b..ae70aa305 100644 --- a/.env.dependencies +++ b/.env.dependencies @@ -13,13 +13,6 @@ KAFKA_CFG_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANON KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT KAFKA_CFG_SASL_ENABLED_MECHANISMS=PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 -# old config names for <1.1.1 -KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 -KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL_PLAINTEXT_ANONYMOUS -KAFKA_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://:9093,INTERNAL_PLAINTEXT_SASL://:9094,EXTERNAL_PLAINTEXT_SASL://:9095 -KAFKA_ADVERTISED_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 -KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT -KAFKA_SASL_ENABLED_MECHANISMS=PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 # Mongo MONGO_INITDB_ROOT_USERNAME=onetl diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 15c6bffd5..1469100a3 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -22,7 +22,7 @@ latest: &latest matrix: small: - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 23.6.1-alpine + clickhouse-version: 24.3.2.23-alpine <<: *max full: # Clickhouse version with proper DateTime > DateTime64 comparison @@ -30,7 +30,7 @@ matrix: clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 23.6.1-alpine + clickhouse-version: 24.3.2.23-alpine <<: *max nightly: - clickhouse-image: yandex/clickhouse-server diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index a7339e139..d20f074ab 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/core/tracked.txt b/.github/workflows/data/core/tracked.txt index da678a6a1..5b2a3ca4d 100644 --- a/.github/workflows/data/core/tracked.txt +++ b/.github/workflows/data/core/tracked.txt @@ -2,5 +2,7 @@ onetl/hooks/** onetl/plugins/** onetl/impl/** onetl/hwm/** +onetl/_util/** onetl/_internal.py onetl/log.py +.github/workflows/data/core/** diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml index 49468d914..d01c39029 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ 
b/.github/workflows/data/ftp/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/ftps image has only latest tag + # chonjay21/ftps image has only latest tag - ftp-version: latest <<: *max full: diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index ec5a862cd..efe28e79a 100644 --- a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/ftps image has only latest tag + # chonjay21/ftps image has only latest tag - ftps-version: latest <<: *max full: diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 2b66c0e19..28ec20e75 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -28,14 +28,14 @@ matrix: package-version: 2.3.1 <<: *max full: - - greenplum-version: 6.25.3 + - greenplum-version: 6.23.1 package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 package-version: 2.3.1 <<: *max nightly: - - greenplum-version: 6.25.3 + - greenplum-version: 6.23.1 package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index e62f0242a..6d8156c50 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -8,7 +8,7 @@ min: &min max: &max hadoop-version: hadoop3-hdfs - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 0f7d4ba6b..6ce0d7a8e 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 29e587721..8050948b7 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -1,6 +1,8 @@ min: &min - # kafka_version: 0.10.2-1-r3 - kafka-version: 3.5.1 + # Headers are supported only since 2.x. 
+ # Images before 3.2.3 are not creating kafka_jaas.conf properly, and failing to start + # https://github.com/bitnami/containers/blob/9db9064668365cac89bff58259f63eb78bb97e79/bitnami/kafka/README.md?plain=1#L933 + kafka-version: 3.2.3 pydantic-version: 1 spark-version: 2.4.8 python-version: '3.7' @@ -8,9 +10,9 @@ min: &min os: ubuntu-latest max: &max - kafka-version: 3.5.1 + kafka-version: 3.7.0 pydantic-version: 2 - spark-version: 3.5.0 + spark-version: 3.5.1 python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index b3db2391f..d1337291e 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -20,6 +20,7 @@ min_excel: &min_excel os: ubuntu-latest max: &max + # Excel package currently has no release for 3.5.1 spark-version: 3.5.0 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index c916cc306..68c19956d 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -7,7 +7,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.4.2 + spark-version: 3.4.3 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -22,12 +22,12 @@ latest: &latest matrix: small: - - mongodb-version: 6.0.7 + - mongodb-version: 7.0.8 <<: *max full: - mongodb-version: 4.0.0 <<: *min - - mongodb-version: 6.0.7 + - mongodb-version: 7.0.8 <<: *max nightly: - mongodb-version: 4.0.0 diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index 0138805bb..19ba2f3e3 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index 9b64e3b93..cd96a63b9 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -21,17 +21,17 @@ latest: &latest matrix: small: - - mysql-version: 8.0.33 + - mysql-version: 8.3.0 <<: *max full: - # Min supported version by JDBC driver is 5.7 - - mysql-version: 5.7.42 + # Min supported version by JDBC driver is 5.7 + - mysql-version: 5.7.6 <<: *min - # Max supported version by JDBC driver is 8.0 - - mysql-version: 8.0.33 + # Max supported version by JDBC driver is 8.3 + - mysql-version: 8.3.0 <<: *max nightly: - - mysql-version: 5.7.42 + - mysql-version: 5.7.6 <<: *min - mysql-version: latest <<: *latest diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 55dc4c185..c0a50fc2b 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -30,10 +30,6 @@ matrix: oracle-version: 11.2.0.2-slim-faststart db-name: XE <<: *min - - oracle-image: gvenzl/oracle-xe - oracle-version: 21.3.0-slim-faststart - db-name: XEPDB1 - <<: *max - oracle-image: gvenzl/oracle-free oracle-version: 23.3-slim-faststart db-name: FREEPDB1 diff --git 
a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 8cdff4f63..7b8e296e5 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -21,13 +21,13 @@ latest: &latest matrix: small: - - postgres-version: 15.2-alpine + - postgres-version: 16.2-alpine <<: *max full: - # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life + # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life - postgres-version: 9.4.26-alpine <<: *min - - postgres-version: 15.2-alpine + - postgres-version: 16.2-alpine <<: *max nightly: - postgres-version: 9.4.26-alpine diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index a0825603b..d9b9338f8 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -9,8 +9,8 @@ min: &min os: ubuntu-latest max: &max - minio-version: 2023.7.18 - spark-version: 3.5.0 + minio-version: 2024.4.18 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index 5b0b2628e..b1e6b56da 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -15,6 +15,7 @@ latest: &latest matrix: small: + # elswork/samba image versions does not correlate with smbd version, it is always 4.x - server-version: latest <<: *max full: diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 0dfd9e730..a32f6f823 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -15,13 +15,13 @@ latest: &latest matrix: small: - - openssh-version: 9.3_p1-r3-ls120 + - openssh-version: 9.6_p1-r0-ls154 <<: *max full: - # prior image versions does not accept incoming connections, seems like a bug + # prior image versions does not accept incoming connections, seems like a bug - openssh-version: 8.1_p1-r0-ls5 <<: *min - - openssh-version: 9.3_p1-r3-ls120 + - openssh-version: 9.6_p1-r0-ls154 <<: *max nightly: - openssh-version: 8.1_p1-r0-ls5 diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 9647daec6..6c2a55455 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,5 +1,5 @@ max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index 8d8f012a7..fb76e3282 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/webdav image has only latest tag + # chonjay21/webdav image has only latest tag - webdav-version: latest <<: *max full: diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 6c790cbc5..4f8d436ec 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ 
inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Clickhouse to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8123 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 4e947d738..e41e1f3eb 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -50,20 +50,15 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. + # See https://github.com/orgs/community/discussions/25792 - name: Start FTP run: | docker compose down -v --remove-orphans - docker compose up -d ftp + docker compose up -d ftp --wait --wait --wait-timeout 200 env: FTP_IMAGE: chonjay21/ftps:${{ inputs.ftp-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - - - name: Wait for FTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2121 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 19cce458c..4fb9c6234 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -50,20 +50,15 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
+ # See https://github.com/orgs/community/discussions/25792 - name: Start FTPS run: | docker compose down -v --remove-orphans - docker compose up -d ftps + docker compose up -d ftps --wait --wait --wait-timeout 200 env: FTPS_IMAGE: chonjay21/ftps:${{ inputs.ftps-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - - - name: Wait for FTPS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2122 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index 918e4f091..6e52a5df1 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -70,8 +70,8 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. + # See https://github.com/orgs/community/discussions/25792 - name: Start HDFS run: | docker compose down -v --remove-orphans @@ -81,11 +81,6 @@ jobs: wait $wait_pid env: HDFS_IMAGE: mtsrus/hadoop:${{ inputs.hadoop-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - - - name: Wait for HDFS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9870 -t 60 - name: Run tests run: | @@ -99,8 +94,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index 120ac3a40..34c2894a9 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -58,13 +58,6 @@ jobs: KAFKA_CFG_ADVERTISED_LISTENERS: INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT KAFKA_CFG_SASL_ENABLED_MECHANISMS: PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 - # old config names for <1.1.1 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL_PLAINTEXT_ANONYMOUS - KAFKA_LISTENERS: INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://:9093,INTERNAL_PLAINTEXT_SASL://:9094,EXTERNAL_PLAINTEXT_SASL://:9095 - KAFKA_ADVERTISED_LISTENERS: INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT - KAFKA_SASL_ENABLED_MECHANISMS: 
PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 ports: - 9093:9093 - 9095:9095 @@ -116,11 +109,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Kafka to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9093 -t 60 - ./docker/wait-for-it.sh -h localhost -p 9095 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index ea230132f..a617450b6 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -81,10 +81,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MongoDB to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 27017 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 23d315a93..1d5ebb853 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -84,10 +84,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MSSQL to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 1433 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index 66bda2e10..e2035cfc7 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MySQL to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 3306 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index 2438fce1d..e11a57b84 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -98,10 +98,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Oracle to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 1522 -t 60 - - name: Run tests run: | export ONETL_ORA_CLIENT_PATH=./oracle/instantclient_21_10 diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 87fd34731..ef31a0375 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -82,10 +82,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: 
Wait for Postgres to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 5432 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 96775f3bf..8da4540cd 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for S3 to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9010 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 3a7c1c921..58db08b88 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -57,11 +57,6 @@ jobs: docker compose up -d samba env: SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - - - name: Wait for Samba to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 445 -t 60 - name: Run tests run: | @@ -74,8 +69,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 569d580f7..ffbf786f2 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -60,10 +60,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for SFTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2222 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index ee23f0ae8..472519643 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -59,11 +59,6 @@ jobs: docker compose up -d webdav env: WEBDAV_IMAGE: chonjay21/webdav:${{ inputs.webdav-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - - - name: Wait for WebDAV to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8000 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 3d132ddb5..e7a60fc13 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -11,7 +11,7 @@ Limitations We should keep close to these items during development: -* Some companies still use old Spark versions, like 2.3.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. +* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. * Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional. 
* Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition. @@ -71,7 +71,7 @@ Create virtualenv and install dependencies: -r requirements/tests/postgres.txt \ -r requirements/tests/oracle.txt \ -r requirements/tests/pydantic-2.txt \ - -r requirements/tests/spark-3.5.0.txt + -r requirements/tests/spark-3.5.1.txt # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 pip install sphinx-plantuml --no-deps diff --git a/README.rst b/README.rst index 625112b2c..da0b84cb6 100644 --- a/README.rst +++ b/README.rst @@ -187,17 +187,17 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | Spark | Python | Java | Scala | +==============================================================+=============+=============+=======+ -| `2.3.x `_ | 3.7 only | 8 only | 2.11 | +| `2.3.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ | `2.4.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ | `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ .. _pyspark-install: @@ -212,7 +212,7 @@ or install PySpark explicitly: .. code:: bash - pip install onetl pyspark==3.5.0 # install a specific PySpark version + pip install onetl pyspark==3.5.1 # install a specific PySpark version or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. **Otherwise connection object cannot be created.** @@ -553,7 +553,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th setup_logging() # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded - maven_packages = SparkS3.get_packages(spark_version="3.5.0") + Postgres.get_packages() + maven_packages = SparkS3.get_packages(spark_version="3.5.1") + Postgres.get_packages() spark = ( SparkSession.builder.appName("spark_app_onetl_demo") .config("spark.jars.packages", ",".join(maven_packages)) diff --git a/docker-compose.yml b/docker-compose.yml index 6ba2aca64..54b2af91d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: context: . 
target: base args: - SPARK_VERSION: 3.5.0 + SPARK_VERSION: 3.5.1 env_file: .env.docker volumes: - ./:/app/ @@ -173,7 +173,7 @@ services: - onetl samba: - image: elswork/samba + image: ${SAMBA_IMAGE:-elswork/samba} restart: unless-stopped ports: - "139:139" diff --git a/docker/Dockerfile b/docker/Dockerfile index 36cbb129f..d3d34ef21 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -42,10 +42,9 @@ USER onetl ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH} COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/ -COPY --chown=onetl:onetl ./docker/wait-for-it.sh /app/docker/wait-for-it.sh -RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh /app/docker/wait-for-it.sh +RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh -ARG SPARK_VERSION=3.5.0 +ARG SPARK_VERSION=3.5.1 # Spark is heavy, and version change is quite rare COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/ RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt diff --git a/docker/wait-for-it.sh b/docker/wait-for-it.sh deleted file mode 100755 index 7410fa3a6..000000000 --- a/docker/wait-for-it.sh +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env bash -# Use this script to test if a given TCP host/port are available - -WAITFORIT_cmdname=${0##*/} - -echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } - -usage() -{ - cat << USAGE >&2 -Usage: - $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] - -h HOST | --host=HOST Host or IP under test - -p PORT | --port=PORT TCP port under test - Alternatively, you specify the host and port as host:port - -s | --strict Only execute subcommand if the test succeeds - -q | --quiet Don't output any status messages - -t TIMEOUT | --timeout=TIMEOUT - Timeout in seconds, zero for no timeout - -- COMMAND ARGS Execute command with args after the test finishes -USAGE - exit 1 -} - -wait_for() -{ - if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then - echoerr "$WAITFORIT_cmdname: waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" - else - echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" - fi - WAITFORIT_start_ts=$(date +%s) - while : - do - if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then - nc -z $WAITFORIT_HOST $WAITFORIT_PORT - WAITFORIT_result=$? - else - (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 - WAITFORIT_result=$? - fi - if [[ $WAITFORIT_result -eq 0 ]]; then - WAITFORIT_end_ts=$(date +%s) - echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" - break - fi - sleep 1 - done - return $WAITFORIT_result -} - -wait_for_wrapper() -{ - # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 - if [[ $WAITFORIT_QUIET -eq 1 ]]; then - timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & - else - timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & - fi - WAITFORIT_PID=$! - trap "kill -INT -$WAITFORIT_PID" INT - wait $WAITFORIT_PID - WAITFORIT_RESULT=$? 
- if [[ $WAITFORIT_RESULT -ne 0 ]]; then - echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" - fi - return $WAITFORIT_RESULT -} - -# process arguments -while [[ $# -gt 0 ]] -do - case "$1" in - *:* ) - WAITFORIT_hostport=(${1//:/ }) - WAITFORIT_HOST=${WAITFORIT_hostport[0]} - WAITFORIT_PORT=${WAITFORIT_hostport[1]} - shift 1 - ;; - --child) - WAITFORIT_CHILD=1 - shift 1 - ;; - -q | --quiet) - WAITFORIT_QUIET=1 - shift 1 - ;; - -s | --strict) - WAITFORIT_STRICT=1 - shift 1 - ;; - -h) - WAITFORIT_HOST="$2" - if [[ $WAITFORIT_HOST == "" ]]; then break; fi - shift 2 - ;; - --host=*) - WAITFORIT_HOST="${1#*=}" - shift 1 - ;; - -p) - WAITFORIT_PORT="$2" - if [[ $WAITFORIT_PORT == "" ]]; then break; fi - shift 2 - ;; - --port=*) - WAITFORIT_PORT="${1#*=}" - shift 1 - ;; - -t) - WAITFORIT_TIMEOUT="$2" - if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi - shift 2 - ;; - --timeout=*) - WAITFORIT_TIMEOUT="${1#*=}" - shift 1 - ;; - --) - shift - WAITFORIT_CLI=("$@") - break - ;; - --help) - usage - ;; - *) - echoerr "Unknown argument: $1" - usage - ;; - esac -done - -if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then - echoerr "Error: you need to provide a host and port to test." - usage -fi - -WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} -WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} -WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} -WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} - -# Check to see if timeout is from busybox? -WAITFORIT_TIMEOUT_PATH=$(type -p timeout) -WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) - -WAITFORIT_BUSYTIMEFLAG="" -if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then - WAITFORIT_ISBUSY=1 - # Check if busybox timeout uses -t flag - # (recent Alpine versions don't support -t anymore) - if timeout &>/dev/stdout | grep -q -e '-t '; then - WAITFORIT_BUSYTIMEFLAG="-t" - fi -else - WAITFORIT_ISBUSY=0 -fi - -if [[ $WAITFORIT_CHILD -gt 0 ]]; then - wait_for - WAITFORIT_RESULT=$? - exit $WAITFORIT_RESULT -else - if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then - wait_for_wrapper - WAITFORIT_RESULT=$? - else - wait_for - WAITFORIT_RESULT=$? 
- fi -fi - -if [[ $WAITFORIT_CLI != "" ]]; then - if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then - echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" - exit $WAITFORIT_RESULT - fi - exec "${WAITFORIT_CLI[@]}" -else - exit $WAITFORIT_RESULT -fi diff --git a/requirements/tests/spark-3.3.3.txt b/requirements/tests/spark-3.3.4.txt similarity index 80% rename from requirements/tests/spark-3.3.3.txt rename to requirements/tests/spark-3.3.4.txt index 259340bf6..55629ed65 100644 --- a/requirements/tests/spark-3.3.3.txt +++ b/requirements/tests/spark-3.3.4.txt @@ -1,5 +1,5 @@ numpy>=1.16,<1.24 pandas>=1.0,<2 pyarrow>=1.0 -pyspark==3.3.3 +pyspark==3.3.4 sqlalchemy<2.0 diff --git a/requirements/tests/spark-3.4.2.txt b/requirements/tests/spark-3.4.3.txt similarity index 76% rename from requirements/tests/spark-3.4.2.txt rename to requirements/tests/spark-3.4.3.txt index c7173637d..5ea738d58 100644 --- a/requirements/tests/spark-3.4.2.txt +++ b/requirements/tests/spark-3.4.3.txt @@ -1,5 +1,5 @@ numpy>=1.16 pandas>=1.0 pyarrow>=1.0 -pyspark==3.4.2 +pyspark==3.4.3 sqlalchemy diff --git a/requirements/tests/spark-3.5.1.txt b/requirements/tests/spark-3.5.1.txt new file mode 100644 index 000000000..d1e812f7a --- /dev/null +++ b/requirements/tests/spark-3.5.1.txt @@ -0,0 +1,5 @@ +numpy>=1.16 +pandas>=1.0 +pyarrow>=1.0 +pyspark==3.5.1 +sqlalchemy diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py index e38de7413..72314b5b3 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py @@ -155,6 +155,7 @@ def test_clickhouse_reader_snapshot_with_columns(spark, processing, load_table_d assert count_df.collect()[0][0] == table_df.count() +@pytest.mark.xfail(reason="Clickhouse <24 deduplicated column names, but 24+ does not") def test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, prepare_schema_table): clickhouse = Clickhouse( host=processing.host, @@ -180,9 +181,9 @@ def test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, p ], ) - # Clickhouse can detect that column is already a part of * and does not produce duplicates - df2 = reader2.run() - assert df1.columns == df2.columns + with pytest.raises(Exception, match="The column `id_int` already exists"): + df2 = reader2.run() + assert df1.columns == df2.columns def test_clickhouse_reader_snapshot_with_columns_mixed_naming(spark, processing, get_schema_table): diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index c786b1fe0..18047d749 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -250,7 +250,7 @@ def table_finalizer(): assert not clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}").count() -@pytest.mark.xfail(reason="Clickhouse 20.7 doesn't support functions") +@pytest.mark.xfail(reason="CREATE FUNCTION is not supported in Clickhouse < 21.20") @pytest.mark.parametrize("suffix", ["", ";"]) def 
test_clickhouse_connection_execute_function( request, diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 67e67b065..0e4fb46ef 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -212,7 +212,7 @@ def test_clickhouse_strategy_incremental_nothing_to_read(spark, processing, prep [ ("float_value", ValueError, "Expression 'float_value' returned values"), ("text_string", RuntimeError, "Cannot detect HWM type for"), - ("unknown_column", Exception, "Missing columns"), + ("unknown_column", Exception, "(Missing columns|Unknown expression).*"), ], ) def test_clickhouse_strategy_incremental_wrong_hwm( From 10ab3703e4d1f971fe15d9ebc7d966d37be156cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 26 Apr 2024 11:30:28 +0000 Subject: [PATCH 47/71] [DOP-15547] Use official MSSQL docker image --- .env.dependencies | 5 +--- .github/workflows/data/mssql/matrix.yml | 11 ++++---- .github/workflows/test-mssql.yml | 26 +++++++++--------- .pre-commit-config.yaml | 2 +- docker-compose.yml | 5 +++- docker/mssql/configure-db.sh | 36 +++++++++++++++++++++++++ docker/mssql/entrypoint.sh | 7 +++++ docker/mssql/setup.sql | 20 ++++++++++++++ 8 files changed, 88 insertions(+), 24 deletions(-) create mode 100755 docker/mssql/configure-db.sh create mode 100755 docker/mssql/entrypoint.sh create mode 100644 docker/mssql/setup.sql diff --git a/.env.dependencies b/.env.dependencies index ae70aa305..5fccfa159 100644 --- a/.env.dependencies +++ b/.env.dependencies @@ -19,11 +19,8 @@ MONGO_INITDB_ROOT_USERNAME=onetl MONGO_INITDB_ROOT_PASSWORD=E4j7h!9A # MSSQL -MSSQL_DB=onetl -MSSQL_USER=onetl -MSSQL_PASSWORD=7ellowEl7akey ACCEPT_EULA=Y -SA_PASSWORD=2astazeY +MSSQL_SA_PASSWORD=2astazeY # MySQL MYSQL_ROOT_PASSWORD=ohbuz9Eochaj9saibooK3thooGa5aesh diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index 19ba2f3e3..c46d98d03 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -21,16 +21,15 @@ latest: &latest matrix: small: - - mssql-version: v2017.CU24.0 + - mssql-version: 2022-CU12-ubuntu-22.04 <<: *max full: - - mssql-version: v2017.CU24.0 + - mssql-version: 2017-GA-ubuntu <<: *min - # v2019.CU4.0 is not very stable - - mssql-version: v2017.CU24.0 + - mssql-version: 2022-CU12-ubuntu-22.04 <<: *max nightly: - - mssql-version: v2017.CU24.0 + - mssql-version: 2017-GA-ubuntu <<: *min - - mssql-version: v2017.CU24.0 + - mssql-version: latest <<: *latest diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 1d5ebb853..0819887aa 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -29,18 +29,6 @@ jobs: test-mssql: name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} - 
services: - mssql: - image: mcmoe/mssqldocker:${{ inputs.mssql-version }} - env: - TZ: UTC - MSSQL_DB: onetl - MSSQL_USER: onetl - MSSQL_PASSWORD: 7ellowEl7akey - ACCEPT_EULA: Y - SA_PASSWORD: 2astazeY - ports: - - 1433:1433 steps: - name: Checkout code @@ -84,6 +72,15 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt + # Cannot use services because we need to mount config file from the repo, but services start before checkout. + # See https://github.com/orgs/community/discussions/25792 + - name: Start MSSQL + run: | + docker compose down -v --remove-orphans + docker compose up -d mssql --wait --wait --wait-timeout 200 + env: + MSSQL_IMAGE: mcr.microsoft.com/mssql/server:${{ inputs.mssql-version }} + - name: Run tests run: | mkdir reports/ || echo "Directory exists" @@ -91,6 +88,11 @@ jobs: source ./env ./pytest_runner.sh -m mssql + - name: Shutdown MSSQL + if: always() + run: | + docker compose down -v --remove-orphans + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa7bde988..b0745931b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ default_language_version: - python: python3.11 + python: python3.12 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docker-compose.yml b/docker-compose.yml index 54b2af91d..ae90e61cb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -83,11 +83,14 @@ services: - onetl mssql: - image: ${MSSQL_IMAGE:-mcmoe/mssqldocker:latest} + image: ${MSSQL_IMAGE:-mcr.microsoft.com/mssql/server:latest} restart: unless-stopped env_file: .env.dependencies ports: - 1433:1433 + volumes: + - ./docker/mssql/:/usr/config/ + entrypoint: ["/usr/config/entrypoint.sh"] networks: - onetl platform: linux/amd64 diff --git a/docker/mssql/configure-db.sh b/docker/mssql/configure-db.sh new file mode 100755 index 000000000..51b39ae32 --- /dev/null +++ b/docker/mssql/configure-db.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -o pipefail + +# Wait 60 seconds for SQL Server to start up by ensuring that +# calling SQLCMD does not return an error code, which will ensure that sqlcmd is accessible +# and that system and user databases return "0" which means all databases are in an "online" state +# https://docs.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-databases-transact-sql?view=sql-server-2017 + +declare DBSTATUS +declare ERRCODE +TIMEOUT=60 +START=$(date +%s) +echo "Configure DB script started at $(date)" + +while true; do + DELTA=$(($(date +%s) - START)) + if [[ $DELTA -gt $TIMEOUT ]]; then + echo "ERROR: SQL Server took more than ${TIMEOUT} seconds to START up or one or more databases are not in an ONLINE state" + exit 1 + fi + + DBSTATUS=$(/opt/mssql-tools/bin/sqlcmd -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') + ERRCODE=$? + if [[ "$DBSTATUS" -eq "0" && "$ERRCODE" -eq "0" ]]; then + echo "INFO: Database ready." + break + else + echo "INFO: Waiting for database to be ready..." 
+ sleep 1 + fi +done + +# Run the setup script to create the DB and the schema in the DB +echo "Running setup.sql"; +/opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; +echo "Success"; diff --git a/docker/mssql/entrypoint.sh b/docker/mssql/entrypoint.sh new file mode 100755 index 000000000..4f274086b --- /dev/null +++ b/docker/mssql/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Start the script to create the DB and user +/usr/config/configure-db.sh & + +# Start SQL Server +/opt/mssql/bin/sqlservr diff --git a/docker/mssql/setup.sql b/docker/mssql/setup.sql new file mode 100644 index 000000000..0cf617237 --- /dev/null +++ b/docker/mssql/setup.sql @@ -0,0 +1,20 @@ +/* + +Enter custom T-SQL here that would run after SQL Server has started up. + +*/ + +CREATE DATABASE onetl; +GO + +USE onetl; +GO + +CREATE LOGIN onetl WITH PASSWORD = '7ellowEl7akey'; +GO + +CREATE USER onetl FOR LOGIN onetl; +GO + +GRANT CONTROL ON DATABASE::onetl TO onetl; +GO From 5417dc55d1133cb50729a7f6f16b702b95e74d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 26 Apr 2024 14:41:12 +0000 Subject: [PATCH 48/71] [DOP-15564] Avoid urlencoding JDBC params --- docker-compose.yml | 2 + docs/changelog/next_release/268.feature.rst | 1 + .../db_connection/clickhouse/connection.py | 13 ++-- .../db_connection/greenplum/connection.py | 21 +++--- .../jdbc_connection/connection.py | 40 ++---------- .../db_connection/jdbc_mixin/connection.py | 27 ++++---- .../db_connection/mssql/connection.py | 11 ++-- .../db_connection/mysql/connection.py | 15 +++-- .../db_connection/oracle/connection.py | 13 ++-- .../db_connection/postgres/connection.py | 15 +++-- .../db_connection/teradata/connection.py | 21 ++++-- .../test_clickhouse_integration.py | 15 +++++ .../test_greenplum_integration.py | 15 +++++ .../test_mssql_integration.py | 15 +++++ .../test_mysql_integration.py | 15 +++++ .../test_oracle_integration.py | 16 +++++ .../test_clickhouse_unit.py | 29 ++++++++- .../test_greenplum_unit.py | 33 ++++++++-- .../test_jdbc_options_unit.py | 64 ------------------- .../test_mssql_unit.py | 33 ++++++++-- .../test_mysql_unit.py | 55 +++++++++++++--- .../test_oracle_unit.py | 28 +++++++- .../test_postgres_unit.py | 46 +++++++++++-- .../test_teradata_unit.py | 34 +++++++++- 24 files changed, 395 insertions(+), 182 deletions(-) create mode 100644 docs/changelog/next_release/268.feature.rst diff --git a/docker-compose.yml b/docker-compose.yml index ae90e61cb..3a61170e0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -31,6 +31,8 @@ services: - 5433:5432 networks: - onetl + sysctls: + - net.ipv6.conf.all.disable_ipv6=1 clickhouse: image: ${CLICKHOUSE_IMAGE:-clickhouse/clickhouse-server:latest-alpine} diff --git a/docs/changelog/next_release/268.feature.rst b/docs/changelog/next_release/268.feature.rst new file mode 100644 index 000000000..0938462ed --- /dev/null +++ b/docs/changelog/next_release/268.feature.rst @@ -0,0 +1 @@ +Allow passing JDBC connection extra params without urlencode. 
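A minimal sketch of what this changelog entry means in practice (illustrative only, not taken from the patch itself; the host, credentials and extra values below are made-up examples, and an already created SparkSession named ``spark`` is assumed):

.. code-block:: python

    from onetl.connection import Clickhouse

    # `extra` values are no longer urlencoded into the JDBC URL;
    # they are handed to the JDBC driver as separate connection properties.
    clickhouse = Clickhouse(
        host="some_host",
        user="user",
        password="passwd",
        database="database",
        extra={"socket_timeout": 120000, "custom_http_params": "key1=value1,key2=value2"},
        spark=spark,  # assumption: an existing SparkSession
    )

    # the URL itself stays free of extra parameters ...
    assert clickhouse.jdbc_url == "jdbc:clickhouse://some_host:8123/database"
    # ... while they are exposed via the new `jdbc_params` property
    assert clickhouse.jdbc_params["socket_timeout"] == 120000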
diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 2f22de9bf..89b7ff463 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -162,13 +162,16 @@ def package(self) -> str: @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - if self.database: - return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}" + + return f"jdbc:clickhouse://{self.host}:{self.port}" - return f"jdbc:clickhouse://{self.host}:{self.port}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + return result @staticmethod def _build_statement( diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index c3e6b3b9f..120d58008 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -250,15 +250,20 @@ def instance_url(self) -> str: @property def jdbc_url(self) -> str: - extra = { - key: value - for key, value in self.extra.dict(by_alias=True).items() - if not (key.startswith("server.") or key.startswith("pool.")) - } - extra["ApplicationName"] = extra.get("ApplicationName", self.spark.sparkContext.appName) + return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update( + { + key: value + for key, value in self.extra.dict(by_alias=True).items() + if not (key.startswith("server.") or key.startswith("pool.")) + }, + ) + result["ApplicationName"] = result.get("ApplicationName", self.spark.sparkContext.appName) + return result @slot def read_source_as_df( diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3133d3671..616e5fd29 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -165,7 +165,7 @@ def write_df_to_target( options: JDBCWriteOptions | None = None, ) -> None: write_options = self.WriteOptions.parse(options) - jdbc_params = self.options_to_jdbc_params(write_options) + jdbc_properties = self._get_jdbc_properties(write_options, exclude={"if_exists"}, exclude_none=True) mode = ( "overwrite" @@ -173,7 +173,7 @@ def write_df_to_target( else write_options.if_exists.value ) log.info("|%s| Saving data to a table %r", self.__class__.__name__, target) - df.write.jdbc(table=target, mode=mode, **jdbc_params) + df.write.format("jdbc").mode(mode).options(dbtable=target, **jdbc_properties).save() log.info("|%s| Table %r successfully written", self.__class__.__name__, target) @slot @@ -196,38 +196,6 @@ def get_df_schema( return df.schema - def options_to_jdbc_params( - self, - options: JDBCReadOptions | JDBCWriteOptions, - ) -> dict: - # Have to replace the parameter with - # since the method takes the named parameter - # link to source below - # 
https://github.com/apache/spark/blob/2ef8ced27a6b0170a691722a855d3886e079f037/python/pyspark/sql/readwriter.py#L465 - - partition_column = getattr(options, "partition_column", None) - if partition_column: - options = options.copy( - update={"column": partition_column}, - exclude={"partition_column"}, - ) - - result = self._get_jdbc_properties( - options, - include=READ_TOP_LEVEL_OPTIONS | WRITE_TOP_LEVEL_OPTIONS, - exclude={"if_exists"}, - exclude_none=True, - ) - - result["properties"] = self._get_jdbc_properties( - options, - exclude=READ_TOP_LEVEL_OPTIONS | WRITE_TOP_LEVEL_OPTIONS | {"if_exists"}, - exclude_none=True, - ) - - result["properties"].pop("partitioningMode", None) - return result - @slot def get_min_max_values( self, @@ -275,8 +243,8 @@ def _query_on_executor( query: str, options: JDBCReadOptions, ) -> DataFrame: - jdbc_params = self.options_to_jdbc_params(options) - return self.spark.read.jdbc(table=f"({query}) T", **jdbc_params) + jdbc_properties = self._get_jdbc_properties(options, exclude={"partitioning_mode"}, exclude_none=True) + return self.spark.read.format("jdbc").options(dbtable=f"({query}) T", **jdbc_properties).load() def _exclude_partition_options( self, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 856d387cf..dae2242b5 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -76,6 +76,16 @@ class JDBCMixin(FrozenModel): def jdbc_url(self) -> str: """JDBC Connection URL""" + @property + def jdbc_params(self) -> dict: + """JDBC Connection params""" + return { + "user": self.user, + "password": self.password.get_secret_value() if self.password is not None else "", + "driver": self.DRIVER, + "url": self.jdbc_url, + } + @slot def close(self): """ @@ -312,20 +322,12 @@ def _get_jdbc_properties( self, options: JDBCMixinOptions, **kwargs, - ) -> dict: + ) -> dict[str, str]: """ Fills up human-readable Options class to a format required by Spark internal methods """ - - result = options.copy( - update={ - "user": self.user, - "password": self.password.get_secret_value() if self.password is not None else "", - "driver": self.DRIVER, - "url": self.jdbc_url, - }, - ).dict(by_alias=True, **kwargs) - + result = self.jdbc_params + result.update(options.dict(by_alias=True, **kwargs)) return stringify(result) def _options_to_connection_properties(self, options: JDBCMixinOptions): @@ -339,8 +341,7 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): * https://github.com/apache/spark/blob/v2.3.0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L248-L255 """ - jdbc_properties = self._get_jdbc_properties(options, exclude_unset=True) - + jdbc_properties = self._get_jdbc_properties(options, exclude_none=True) jdbc_utils_package = self.spark._jvm.org.apache.spark.sql.execution.datasources.jdbc # type: ignore jdbc_options = jdbc_utils_package.JDBCOptions( self.jdbc_url, diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 8fa91b47a..48143191d 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -200,11 +200,14 @@ def package(cls) -> str: @property def jdbc_url(self) -> str: - prop = self.extra.dict(by_alias=True) - prop["databaseName"] = self.database - parameters = ";".join(f"{k}={v}" for k, v in sorted(prop.items())) + return 
f"jdbc:sqlserver://{self.host}:{self.port}" - return f"jdbc:sqlserver://{self.host}:{self.port};{parameters}" + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + result["databaseName"] = self.database + return result @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index da71de55b..a26f8f385 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -138,11 +138,14 @@ def package(cls) -> str: return "com.mysql:mysql-connector-j:8.3.0" @property - def jdbc_url(self): - prop = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(prop.items())) - + def jdbc_url(self) -> str: if self.database: - return f"jdbc:mysql://{self.host}:{self.port}/{self.database}?{parameters}" + return f"jdbc:mysql://{self.host}:{self.port}/{self.database}" + + return f"jdbc:mysql://{self.host}:{self.port}" - return f"jdbc:mysql://{self.host}:{self.port}?{parameters}" + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + return result diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index a2d8d35b9..d566fa275 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -226,13 +226,16 @@ def package(cls) -> str: @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - if self.sid: - return f"jdbc:oracle:thin:@{self.host}:{self.port}:{self.sid}?{parameters}".rstrip("?") + return f"jdbc:oracle:thin:@{self.host}:{self.port}:{self.sid}" + + return f"jdbc:oracle:thin:@//{self.host}:{self.port}/{self.service_name}" - return f"jdbc:oracle:thin:@//{self.host}:{self.port}/{self.service_name}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + return result @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 80cddbc11..16d317fee 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -20,6 +20,10 @@ class PostgresExtra(GenericOptions): # allows automatic conversion from text to target column type during write stringtype: str = "unspecified" + # avoid closing connections from server side + # while connector is moving data to executors before insert + tcpKeepAlive: str = "true" # noqa: N815 + class Config: extra = "allow" @@ -142,11 +146,14 @@ def package(cls) -> str: @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - extra["ApplicationName"] = extra.get("ApplicationName", self.spark.sparkContext.appName) + return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict[str, str]: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + result["ApplicationName"] = result.get("ApplicationName", 
self.spark.sparkContext.appName) + return result @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 93bd51468..cf135009d 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -5,6 +5,7 @@ import warnings from typing import ClassVar, Optional +from onetl._internal import stringify from onetl._util.classproperty import classproperty from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection @@ -162,12 +163,22 @@ def package(cls) -> str: @property def jdbc_url(self) -> str: - prop = self.extra.dict(by_alias=True) + # Teradata JDBC driver documentation specifically mentions that params from + # java.sql.DriverManager.getConnection(url, params) are used to only retrieve 'user' and 'password' values. + # Other params should be passed via url + properties = self.extra.dict(by_alias=True) if self.database: - prop["DATABASE"] = self.database + properties["DATABASE"] = self.database - prop["DBS_PORT"] = self.port + properties["DBS_PORT"] = self.port - conn = ",".join(f"{k}={v}" for k, v in sorted(prop.items())) - return f"jdbc:teradata://{self.host}/{conn}" + connection_params = [] + for key, value in sorted(properties.items()): + string_value = stringify(value) + if "," in string_value: + connection_params.append(f"{key}='{string_value}'") + else: + connection_params.append(f"{key}={string_value}") + + return f"jdbc:teradata://{self.host}/{','.join(connection_params)}" diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 18047d749..78656d834 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -46,6 +46,21 @@ def test_clickhouse_connection_check_fail(spark): clickhouse.check() +def test_clickhouse_connection_check_extra_is_handled_by_driver(spark, processing): + clickhouse = Clickhouse( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"socket_timeout": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + clickhouse.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix): clickhouse = Clickhouse( diff --git a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py index 4514594c5..5c2d17115 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py @@ -48,6 +48,21 @@ def test_greenplum_connection_check_fail(spark): greenplum.check() +def test_greenplum_connection_check_extra_is_handled_by_driver(spark, processing): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={**processing.extra, "connectTimeout": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, 
match="Connection is unavailable"): + greenplum.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_greenplum_connection_fetch(spark, processing, load_table_data, suffix): greenplum = Greenplum( diff --git a/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py index 9a875671a..4fad8a754 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py @@ -55,6 +55,21 @@ def test_mssql_connection_check_fail(spark): mssql.check() +def test_mssql_connection_check_extra_is_handled_by_driver(spark, processing): + mssql = MSSQL( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"trustServerCertificate": "false"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + mssql.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_mssql_connection_sql(spark, processing, load_table_data, suffix): mssql = MSSQL( diff --git a/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py index 72a6b3b8f..8a4840329 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py @@ -46,6 +46,21 @@ def test_mysql_connection_check_fail(spark): mysql.check() +def test_mysql_connection_check_extra_is_handled_by_driver(spark, processing): + mysql = MySQL( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"tcpKeepAlive": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + mysql.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_mysql_connection_sql(spark, processing, load_table_data, suffix): mysql = MySQL( diff --git a/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py b/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py index 6bd96b259..485ca1911 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py @@ -55,6 +55,22 @@ def test_oracle_connection_check_fail(spark): oracle.check() +def test_oracle_connection_check_extra_is_handled_by_driver(spark, processing): + oracle = Oracle( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + spark=spark, + sid=processing.sid, + service_name=processing.service_name, + extra={"defaultRowPrefetch": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + oracle.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_oracle_connection_sql(spark, processing, load_table_data, suffix): oracle = Oracle( diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 79fc13ddc..29478b6c9 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py 
@@ -121,6 +121,12 @@ def test_clickhouse(spark_mock): assert conn.database == "database" assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123/database", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -144,6 +150,12 @@ def test_clickhouse_with_port(spark_mock): assert conn.database == "database" assert conn.jdbc_url == "jdbc:clickhouse://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:5000/database", + } def test_clickhouse_without_database(spark_mock): @@ -157,6 +169,12 @@ def test_clickhouse_without_database(spark_mock): assert not conn.database assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123", + } def test_clickhouse_with_extra(spark_mock): @@ -165,11 +183,18 @@ def test_clickhouse_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"socket_timeout": "120000", "query": "SELECT%201%3B"}, + extra={"socket_timeout": 120000, "custom_http_params": "key1=value1,key2=value2"}, spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123/database?query=SELECT%201%3B&socket_timeout=120000" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123/database", + "socket_timeout": 120000, + "custom_http_params": "key1=value1,key2=value2", + } def test_clickhouse_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index de24e5ce2..5c824d127 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -119,7 +119,15 @@ def test_greenplum(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&tcpKeepAlive=true" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -135,7 +143,15 @@ def test_greenplum_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&tcpKeepAlive=true" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5000/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + } def test_greenplum_without_database_error(spark_mock): @@ -161,9 +177,16 @@ def test_greenplum_with_extra(spark_mock): # `server.*` 
and `pool.*` options are ignored while generating jdbc_url # they are used only in `read_source_as_df` and `write_df_to_target` - assert conn.jdbc_url == ( - "jdbc:postgresql://some_host:5432/database?ApplicationName=override&autosave=always&tcpKeepAlive=false" - ) + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "override", + "tcpKeepAlive": "false", + "autosave": "always", + } def test_greenplum_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index 7c8ecfcca..47148c6b9 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -197,70 +197,6 @@ def test_jdbc_write_options_case(): assert camel_case == snake_case -def test_jdbc_read_options_to_jdbc(spark_mock): - connection = Postgres(host="local", user="admin", database="default", password="1234", spark=spark_mock) - jdbc_params = connection.options_to_jdbc_params( - options=Postgres.ReadOptions( - lowerBound=10, - upperBound=1000, - partitionColumn="some_column", - numPartitions=20, - fetchsize=1000, - sessionInitStatement="BEGIN execute immediate 'alter session set '_serial_direct_read'=true", - snake_case_option="left unchanged", - camelCaseOption="left unchanged", - CamelCaseOption="left unchanged", - ), - ) - - assert jdbc_params == { - "column": "some_column", - "lowerBound": "10", - "numPartitions": "20", - "properties": { - "driver": "org.postgresql.Driver", - "fetchsize": "1000", - "password": "1234", - "sessionInitStatement": "BEGIN execute immediate 'alter session set '_serial_direct_read'=true", - "user": "admin", - "snake_case_option": "left unchanged", - "camelCaseOption": "left unchanged", - "CamelCaseOption": "left unchanged", - }, - "upperBound": "1000", - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", - } - - -def test_jdbc_write_options_to_jdbc(spark_mock): - connection = Postgres(host="local", user="admin", database="default", password="1234", spark=spark_mock) - jdbc_params = connection.options_to_jdbc_params( - options=Postgres.WriteOptions( - batchsize=1000, - truncate=True, - isolation_level="NONE", - snake_case_option="left unchanged", - camelCaseOption="left unchanged", - CamelCaseOption="left unchanged", - ), - ) - - assert jdbc_params == { - "properties": { - "batchsize": "1000", - "driver": "org.postgresql.Driver", - "password": "1234", - "isolationLevel": "NONE", - "truncate": "true", - "user": "admin", - "snake_case_option": "left unchanged", - "camelCaseOption": "left unchanged", - "CamelCaseOption": "left unchanged", - }, - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", - } - - @pytest.mark.parametrize( "options, value", [ diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index 51a548166..e1069c20e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ -92,7 +92,14 @@ def test_mssql(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == 
"jdbc:sqlserver://some_host:1433;databaseName=database" + assert conn.jdbc_url == "jdbc:sqlserver://some_host:1433" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:1433", + "databaseName": "database", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -108,7 +115,14 @@ def test_mssql_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:sqlserver://some_host:5000;databaseName=database" + assert conn.jdbc_url == "jdbc:sqlserver://some_host:5000" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:5000", + "databaseName": "database", + } def test_mssql_without_database_error(spark_mock): @@ -118,7 +132,6 @@ def test_mssql_without_database_error(spark_mock): user="user", password="passwd", spark=spark_mock, - extra={"trustServerCertificate": "true"}, ) @@ -132,10 +145,16 @@ def test_mssql_with_extra(spark_mock): spark=spark_mock, ) - assert ( - conn.jdbc_url - == "jdbc:sqlserver://some_host:1433;characterEncoding=UTF-8;databaseName=database;trustServerCertificate=true" - ) + assert conn.jdbc_url == "jdbc:sqlserver://some_host:1433" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:1433", + "databaseName": "database", + "characterEncoding": "UTF-8", + "trustServerCertificate": "true", + } def test_mssql_with_extra_prohibited(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index c071e1196..da9267586 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -79,7 +79,15 @@ def test_mysql(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -95,7 +103,15 @@ def test_mysql_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:mysql://some_host:5000/database?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:5000/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } def test_mysql_without_database(spark_mock): @@ -108,7 +124,15 @@ def test_mysql_without_database(spark_mock): assert conn.password.get_secret_value() == "passwd" assert not conn.database - assert conn.jdbc_url == "jdbc:mysql://some_host:3306?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:3306" + assert 
conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } def test_mysql_with_extra(spark_mock): @@ -121,10 +145,17 @@ def test_mysql_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == ( - "jdbc:mysql://some_host:3306/database?allowMultiQueries=true&characterEncoding=UTF-8&" - "requireSSL=true&useUnicode=yes" - ) + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + "allowMultiQueries": "true", + "requireSSL": "true", + } conn = MySQL( host="some_host", @@ -135,7 +166,15 @@ def test_mysql_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == ("jdbc:mysql://some_host:3306/database?characterEncoding=CP-1251&useUnicode=no") + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "CP-1251", + "useUnicode": "no", + } def test_mysql_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index cb2b9dc7b..d4db6940e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -103,6 +103,12 @@ def test_oracle(spark_mock): assert conn.sid == "sid" assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:1521:sid", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -119,12 +125,24 @@ def test_oracle_with_port(spark_mock): assert conn.sid == "sid" assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:5000:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:5000:sid", + } def test_oracle_uri_with_service_name(spark_mock): conn = Oracle(host="some_host", user="user", password="passwd", service_name="service", spark=spark_mock) assert conn.jdbc_url == "jdbc:oracle:thin:@//some_host:1521/service" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@//some_host:1521/service", + } def test_oracle_without_sid_and_service_name(spark_mock): @@ -167,7 +185,15 @@ def test_oracle_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid?connectTimeout=10&tcpKeepAlive=false" + assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:1521:sid", + "tcpKeepAlive": "false", + "connectTimeout": "10", + } def test_oracle_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index 
f4c00f30f..268525220 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -79,7 +79,16 @@ def test_postgres(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&stringtype=unspecified" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + "stringtype": "unspecified", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -95,7 +104,16 @@ def test_postgres_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&stringtype=unspecified" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5000/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + "stringtype": "unspecified", + } def test_postgres_without_database_error(spark_mock): @@ -109,14 +127,28 @@ def test_postgres_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"ssl": "true", "autosave": "always"}, + extra={ + "stringtype": "VARCHAR", + "autosave": "always", + "tcpKeepAlive": "false", + "ApplicationName": "override", + "ssl": "true", + }, spark=spark_mock, ) - assert ( - conn.jdbc_url - == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true&stringtype=unspecified" - ) + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "stringtype": "VARCHAR", + "autosave": "always", + "tcpKeepAlive": "false", + "ApplicationName": "override", + "ssl": "true", + } def test_postgres_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py index fd90d31d4..b71d7e8d1 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py @@ -82,6 +82,12 @@ def test_teradata(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," "DBS_PORT=1025,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -101,6 +107,12 @@ def test_teradata_with_port(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," "DBS_PORT=5000,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_without_database(spark_mock): @@ -117,6 +129,12 @@ def 
test_teradata_without_database(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON," "DBS_PORT=1025,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_with_extra(spark_mock): @@ -125,14 +143,20 @@ def test_teradata_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"TMODE": "TERA", "LOGMECH": "LDAP"}, + extra={"TMODE": "TERA", "LOGMECH": "LDAP", "PARAM_WITH_COMMA": "some,value"}, spark=spark_mock, ) assert conn.jdbc_url == ( "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," - "DBS_PORT=1025,FLATTEN=ON,LOGMECH=LDAP,MAYBENULL=ON,STRICT_NAMES=OFF,TMODE=TERA" + "DBS_PORT=1025,FLATTEN=ON,LOGMECH=LDAP,MAYBENULL=ON,PARAM_WITH_COMMA='some,value',STRICT_NAMES=OFF,TMODE=TERA" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } conn = Teradata( host="some_host", @@ -147,6 +171,12 @@ def test_teradata_with_extra(spark_mock): "jdbc:teradata://some_host/CHARSET=CP-1251,COLUMN_NAME=OFF,DATABASE=database," "DBS_PORT=1025,FLATTEN=OFF,MAYBENULL=OFF,STRICT_NAMES=ON" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_with_extra_prohibited(spark_mock): From 076d0739fe641407bc76ed19f54658751eef4c21 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Fri, 3 May 2024 16:04:42 +0300 Subject: [PATCH 49/71] [DOP-13846] - implement XML.parse_column (#269) * [DOP-13846] - implement XML.parse_column * [DOP-13845] - add xml documentation * [DOP-13846] - add bypass test and note for rootTag * [DOP-13846] - update test for xml parse column * [DOP-13846] - update test for xml parse column * [DOP-13846] - update test for xml parse column * [DOP-13846] - update parse_column note --- docs/changelog/next_release/269.feature.rst | 1 + .../db_connection/kafka/format_handling.rst | 53 ++++++++ docs/file_df/file_formats/xml.rst | 2 +- onetl/file/format/xml.py | 113 +++++++++++++++++- .../test_xml_integration.py | 49 +++++++- 5 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 docs/changelog/next_release/269.feature.rst diff --git a/docs/changelog/next_release/269.feature.rst b/docs/changelog/next_release/269.feature.rst new file mode 100644 index 000000000..53bb70363 --- /dev/null +++ b/docs/changelog/next_release/269.feature.rst @@ -0,0 +1 @@ +Add ``XML.parse_column`` method for handling XML data within Spark. This method allows for direct parsing of XML strings into structured Spark DataFrame columns. diff --git a/docs/connection/db_connection/kafka/format_handling.rst b/docs/connection/db_connection/kafka/format_handling.rst index bc1993a20..5f2d00864 100644 --- a/docs/connection/db_connection/kafka/format_handling.rst +++ b/docs/connection/db_connection/kafka/format_handling.rst @@ -242,3 +242,56 @@ To serialize structured data into Avro format and write it back to a Kafka topic # | 1|[02 02 02 08 76 6... (binary data)] | # | 2|[02 04 02 08 76 6... (binary data)] | # +---+------------------------------------+ + +XML Format Handling +------------------- + +Handling XML data in Kafka involves parsing string representations of XML into structured Spark DataFrame format. 
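The ``DBReader`` example that follows returns the parsed XML as a single struct column. As a small follow-up sketch (assuming the same ``parsed_xml_df`` DataFrame as in that example; not part of the patch), the nested fields can be flattened with ordinary Spark column expressions:

.. code-block:: python

    # assumes `parsed_xml_df` from the example below, where "value" is a
    # struct with `name` (string) and `age` (integer) fields
    flat_df = parsed_xml_df.select("value.name", "value.age")
    flat_df.show()
    # +-----+---+
    # | name|age|
    # +-----+---+
    # |Alice| 20|
    # |  Bob| 25|
    # +-----+---+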
+ +``DBReader`` +~~~~~~~~~~~~ + +To process XML formatted data from Kafka, use the :obj:`XML.parse_column ` method. This method allows you to convert a column containing XML strings directly into a structured Spark DataFrame using a specified schema. + +.. code-block:: python + + from pyspark.sql import SparkSession + from pyspark.sql.types import StructType, StructField, StringType, IntegerType + + from onetl.db import DBReader + from onetl.file.format import XML + from onetl.connection import Kafka + + spark = SparkSession.builder.appName("KafkaXMLExample").getOrCreate() + + kafka = Kafka(...) + xml = XML(row_tag="person") + + reader = DBReader( + connection=kafka, + topic="topic_name", + ) + df = reader.run() + + df.show() + # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ + # |key |value |topic |partition|offset|timestamp |timestampType| + # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ + # |[31]|"Alice20" |topicXML |0 |0 |2024-04-24 13:02:25.911|0 | + # |[32]|"Bob25" |topicXML |0 |1 |2024-04-24 13:02:25.922|0 | + # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ + + xml_schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ] + ) + parsed_xml_df = df.select(xml.parse_column("value", xml_schema)) + parsed_xml_df.show() + # +-----------+ + # |value | + # +-----------+ + # |{Alice, 20}| + # |{Bob, 25} | + # +-----------+ diff --git a/docs/file_df/file_formats/xml.rst b/docs/file_df/file_formats/xml.rst index 187aa89a4..cfe560ccc 100644 --- a/docs/file_df/file_formats/xml.rst +++ b/docs/file_df/file_formats/xml.rst @@ -6,4 +6,4 @@ XML .. currentmodule:: onetl.file.format.xml .. autoclass:: XML - :members: get_packages + :members: get_packages, parse_column diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 83c02329b..f1dc337b3 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -19,7 +19,8 @@ from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import Column, SparkSession + from pyspark.sql.types import StructType PROHIBITED_OPTIONS = frozenset( @@ -226,3 +227,113 @@ def check_if_supported(self, spark: SparkSession) -> None: if log.isEnabledFor(logging.DEBUG): log.debug("Missing Java class", exc_info=e, stack_info=True) raise ValueError(msg) from e + + def parse_column(self, column: str | Column, schema: StructType) -> Column: + """ + Parses an XML string column into a structured Spark SQL column using the ``from_xml`` function + provided by the `Databricks Spark XML library `_ + based on the provided schema. + + .. note:: + + This method assumes that the ``spark-xml`` package is installed: :obj:`XML.get_packages `. + + .. note:: + + This method parses each DataFrame row individually. Therefore, for a specific column, each row must contain exactly one occurrence of the ``rowTag`` specified. If your XML data includes a root tag that encapsulates multiple row tags, you can adjust the schema to use an ``ArrayType`` to keep all child elements under the single root. + + .. 
code-block:: xml
+
+                <books>
+                    <book><title>Book One</title><author>Author A</author></book>
+                    <book><title>Book Two</title><author>Author B</author></book>
+                </books>
+
+            And the corresponding schema in Spark using an ``ArrayType``:
+
+            .. code-block:: python
+
+                from pyspark.sql.types import StructType, StructField, ArrayType, StringType
+
+                schema = StructType(
+                    [
+                        StructField(
+                            "book",
+                            ArrayType(
+                                StructType(
+                                    [
+                                        StructField("title", StringType(), True),
+                                        StructField("author", StringType(), True),
+                                    ]
+                                )
+                            ),
+                            True,
+                        )
+                    ]
+                )
+
+        Parameters
+        ----------
+        column : str | Column
+            The name of the column or the Column object containing XML strings to parse.
+
+        Returns
+        -------
+        Column
+            A new Column object with data parsed from XML string to the specified structured format.
+
+        Examples
+        --------
+        .. code-block:: python
+
+            from pyspark.sql import SparkSession
+            from pyspark.sql.types import StructType, StructField, StringType, IntegerType
+
+            from onetl.file.format import XML
+
+            spark = SparkSession.builder.appName("XMLParsingExample").getOrCreate()
+            schema = StructType(
+                [
+                    StructField("author", StringType(), nullable=True),
+                    StructField("title", StringType(), nullable=True),
+                    StructField("genre", StringType(), nullable=True),
+                    StructField("price", IntegerType(), nullable=True),
+                ]
+            )
+            xml_processor = XML(row_tag="book")
+
+            data = [
+                (
+                    "<book><author>Austen, Jane</author><title>Pride and Prejudice</title><genre>romance</genre><price>19</price></book>",
+                )
+            ]
+            df = spark.createDataFrame(data, ["xml_string"])
+
+            parsed_df = df.select(xml_processor.parse_column("xml_string", schema=schema))
+            parsed_df.show()
+
+        """
+        from pyspark.sql import Column, SparkSession  # noqa: WPS442
+
+        spark = SparkSession._instantiatedSession  # noqa: WPS437
+        self.check_if_supported(spark)
+
+        from pyspark.sql.column import _to_java_column  # noqa: WPS450
+        from pyspark.sql.functions import col
+
+        if isinstance(column, Column):
+            column_name, column = column._jc.toString(), column.cast("string")  # noqa: WPS437
+        else:
+            column_name, column = column, col(column).cast("string")
+
+        java_column = _to_java_column(column)
+        java_schema = spark._jsparkSession.parseDataType(schema.json())  # noqa: WPS437
+        scala_options = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap(  # noqa: WPS219, WPS437
+            self.dict(),
+        )
+        jc = spark._jvm.com.databricks.spark.xml.functions.from_xml(  # noqa: WPS219, WPS437
+            java_column,
+            java_schema,
+            scala_options,
+        )
+        return Column(jc).alias(column_name)
diff --git a/tests/tests_integration/test_file_format_integration/test_xml_integration.py b/tests/tests_integration/test_file_format_integration/test_xml_integration.py
index 705d7ff84..5ebaaf1ab 100644
--- a/tests/tests_integration/test_file_format_integration/test_xml_integration.py
+++ b/tests/tests_integration/test_file_format_integration/test_xml_integration.py
@@ -4,6 +4,8 @@
 Do not test all the possible options and combinations, we are not testing Spark here.
""" +import datetime + import pytest from onetl._util.spark import get_spark_version @@ -11,9 +13,12 @@ from onetl.file.format import XML try: + from pyspark.sql import Row + from pyspark.sql.functions import col + from tests.util.assert_df import assert_equal_df except ImportError: - pytest.skip("Missing pandas", allow_module_level=True) + pytest.skip("Missing pandas or pyspark", allow_module_level=True) pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.xml] @@ -166,3 +171,45 @@ def test_xml_reader_with_attributes( assert read_df.count() assert read_df.schema == expected_xml_attributes_df.schema assert_equal_df(read_df, expected_xml_attributes_df, order_by="id") + + +@pytest.mark.parametrize( + "xml_input, expected_row", + [ + ( + """ + 1 + Alice + 123 + 2021-01-01 + 2021-01-01T07:01:01Z + 1.23 + """, + Row( + xml_string=Row( + id=1, + str_value="Alice", + int_value=123, + date_value=datetime.date(2021, 1, 1), + datetime_value=datetime.datetime(2021, 1, 1, 7, 1, 1), + float_value=1.23, + ), + ), + ), + ], + ids=["basic-case"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_xml_parse_column(spark, xml_input: str, expected_row: Row, column_type, file_df_schema): + from onetl.file.format import XML + + spark_version = get_spark_version(spark) + if spark_version.major < 3: + pytest.skip("XML files are supported on Spark 3.x only") + + xml = XML(row_tag="item") + df = spark.createDataFrame([(xml_input,)], ["xml_string"]) + parsed_df = df.select(xml.parse_column(column_type("xml_string"), schema=file_df_schema)) + result_row = parsed_df.first() + + assert result_row == expected_row From 09973cfec107bddfe403754e3645ad2d740734b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 06:24:06 +0000 Subject: [PATCH 50/71] Bump mikefarah/yq from 4.43.1 to 4.44.1 in the github-actions group Bumps the github-actions group with 1 update: [mikefarah/yq](https://github.com/mikefarah/yq). Updates `mikefarah/yq` from 4.43.1 to 4.44.1 - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.43.1...v4.44.1) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index 5cf33028e..8f024cf88 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ 
-634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.43.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 3cd5dfcf90f2b110a24f1415fd4aae3714e02f40 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Thu, 16 May 2024 16:04:44 +0300 Subject: [PATCH 51/71] [DOP-15749] - separate ReadOptions into ReadOptions and SQLOptions (#272) * [DOP-15749] - separate ReadOptions into ReadOptions and SQLOptions * [DOP-15749] - update docs & tests * [DOP-15749] - update docs & tests * Update onetl/connection/db_connection/jdbc_connection/options.py Co-authored-by: Maxim Martynov --------- Co-authored-by: Maxim Martynov --- docs/changelog/next_release/272.feature.rst | 47 ++++++++ .../db_connection/clickhouse/sql.rst | 9 +- docs/connection/db_connection/mssql/sql.rst | 9 +- docs/connection/db_connection/mysql/sql.rst | 9 +- docs/connection/db_connection/oracle/sql.rst | 9 +- .../connection/db_connection/postgres/sql.rst | 9 +- .../connection/db_connection/teradata/sql.rst | 9 +- .../jdbc_connection/connection.py | 25 ++++- .../db_connection/jdbc_connection/options.py | 103 ++++++++++++++++++ .../test_postgres_integration.py | 44 ++++++++ .../test_jdbc_options_unit.py | 24 ++++ 11 files changed, 285 insertions(+), 12 deletions(-) create mode 100644 docs/changelog/next_release/272.feature.rst diff --git a/docs/changelog/next_release/272.feature.rst b/docs/changelog/next_release/272.feature.rst new file mode 100644 index 000000000..eb33bfd2c --- /dev/null +++ b/docs/changelog/next_release/272.feature.rst @@ -0,0 +1,47 @@ +``ReadOptions`` and ``SQLOptions`` have been separated for JDBC connections. ``SQLOptions`` are recommended for the ``.sql`` method in JDBC connections. +``SQLOptions`` do not support ``partitioning_mode`` and require explicit definition of ``lower_bound`` and ``upper_bound`` when ``num_partitions`` is greater than 1. +``ReadOptions`` allow the inclusion of ``partitioning_mode`` and automatically handle ``lower_bound`` and ``upper_bound`` based on the data distribution + + +Before: + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.ReadOptions( + partitioning_mode="range", + partition_column="id", + num_partitions=10, + ), + ) + +After: + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.SQLOptions( + # partitioning_mode is not supported! + partition_column="id", + num_partitions=10, + # this should be set explicitly! 
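+            # note: Spark uses lower_bound/upper_bound only to split the "id" range into parallel reads;
+            # rows outside these bounds are still returned, they just all land in the first/last partition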
+            lower_bound=0,
+            upper_bound=1000,
+        ),
+    )
diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst
index 1a3a1d52a..a81eda8b9 100644
--- a/docs/connection/db_connection/clickhouse/sql.rst
+++ b/docs/connection/db_connection/clickhouse/sql.rst
@@ -43,7 +43,7 @@ Examples
         WHERE key = 'something'
         """,
-        options=Clickhouse.ReadOptions(
+        options=Clickhouse.SQLOptions(
             partition_column="id",
             num_partitions=10,
             lower_bound=0,
@@ -66,3 +66,10 @@ Pay attention to ``where`` value
 Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause.
 This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query.
 Especially if there are indexes or partitions for columns used in ``where`` clause.
+
+Options
+-------
+
+.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options
+
+.. autopydantic_model:: JDBCSQLOptions
diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst
index 8a59d376a..de4f6fe63 100644
--- a/docs/connection/db_connection/mssql/sql.rst
+++ b/docs/connection/db_connection/mssql/sql.rst
@@ -43,7 +43,7 @@ Examples
         WHERE key = 'something'
         """,
-        options=MSSQL.ReadOptions(
+        options=MSSQL.SQLOptions(
             partition_column="id",
             num_partitions=10,
             lower_bound=0,
@@ -66,3 +66,10 @@ Pay attention to ``where`` value
 Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause.
 This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query.
 Especially if there are indexes or partitions for columns used in ``where`` clause.
+
+Options
+-------
+
+.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options
+
+.. autopydantic_model:: JDBCSQLOptions
diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst
index c161efa83..949fb3aa3 100644
--- a/docs/connection/db_connection/mysql/sql.rst
+++ b/docs/connection/db_connection/mysql/sql.rst
@@ -44,7 +44,7 @@ Examples
         WHERE key = 'something'
         """,
-        options=MySQL.ReadOptions(
+        options=MySQL.SQLOptions(
             partition_column="id",
             num_partitions=10,
             lower_bound=0,
@@ -67,3 +67,10 @@ Pay attention to ``where`` value
 Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause.
 This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query.
 Especially if there are indexes or partitions for columns used in ``where`` clause.
+
+Options
+-------
+
+.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options
+
+.. autopydantic_model:: JDBCSQLOptions
diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst
index 969c28afa..5a94daa42 100644
--- a/docs/connection/db_connection/oracle/sql.rst
+++ b/docs/connection/db_connection/oracle/sql.rst
@@ -44,7 +44,7 @@ Examples
         WHERE key = 'something'
         """,
-        options=Oracle.ReadOptions(
+        options=Oracle.SQLOptions(
             partition_column="id",
             num_partitions=10,
             lower_bound=0,
@@ -67,3 +67,10 @@ Pay attention to ``where`` value
 Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause.
 This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query.
Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options + +.. autopydantic_model:: JDBCSQLOptions diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index 3f762f1ad..f64cf528e 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -43,7 +43,7 @@ Examples WHERE key = 'something' """, - options=Postgres.ReadOptions( + options=Postgres.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -66,3 +66,10 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options + +.. autopydantic_model:: JDBCSQLOptions diff --git a/docs/connection/db_connection/teradata/sql.rst b/docs/connection/db_connection/teradata/sql.rst index 3531d01c3..98b03e107 100644 --- a/docs/connection/db_connection/teradata/sql.rst +++ b/docs/connection/db_connection/teradata/sql.rst @@ -41,7 +41,7 @@ Examples WHERE key = 'something' """, - options=Teradata.ReadOptions( + options=Teradata.SQLOptions( partition_column="part_column", num_partitions=10, lower_bound=0, @@ -64,3 +64,10 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options + +.. autopydantic_model:: JDBCSQLOptions diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 616e5fd29..3c66a20d7 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -4,6 +4,7 @@ import logging import secrets +import warnings from typing import TYPE_CHECKING, Any from etl_entities.instance import Host @@ -15,6 +16,7 @@ JDBCLegacyOptions, JDBCPartitioningMode, JDBCReadOptions, + JDBCSQLOptions, JDBCTableExistBehavior, JDBCWriteOptions, ) @@ -50,6 +52,7 @@ class JDBCConnection(JDBCMixin, DBConnection): Dialect = JDBCDialect ReadOptions = JDBCReadOptions + SQLOptions = JDBCSQLOptions WriteOptions = JDBCWriteOptions Options = JDBCLegacyOptions @@ -61,7 +64,7 @@ def instance_url(self) -> str: def sql( self, query: str, - options: JDBCReadOptions | dict | None = None, + options: JDBCSQLOptions | dict | None = None, ) -> DataFrame: """ **Lazily** execute SELECT statement **on Spark executor** and return DataFrame. |support_hooks| @@ -74,7 +77,7 @@ def sql( SQL query to be executed. - options : dict, :obj:`~ReadOptions`, default: ``None`` + options : dict, :obj:`~SQLOptions`, default: ``None`` Spark options to be used while fetching data. 
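The hunk below adds a backward-compatibility shim: passing ``ReadOptions`` to ``.sql()`` still works, but emits a ``UserWarning`` and re-parses the options as ``SQLOptions``. A minimal sketch of how that looks from user code (connection parameters omitted, table name hypothetical):

.. code:: python

    import warnings

    from onetl.connection import Postgres

    postgres = Postgres(...)  # hypothetical connection parameters

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df = postgres.sql(
            "SELECT * FROM some.mytable",
            options=Postgres.ReadOptions(fetchsize=5_000),  # deprecated for .sql()
        )

    # the shim converts the options to SQLOptions and warns about the deprecation
    assert any("use `SQLOptions` instead" in str(w.message) for w in caught)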
@@ -86,12 +89,17 @@ def sql( """ + if isinstance(options, JDBCReadOptions): + msg = "Using `ReadOptions` for `sql` method is deprecated, use `SQLOptions` instead." + warnings.warn(msg, UserWarning, stacklevel=3) + options = self.SQLOptions.parse_obj(options.dict(exclude={"partitioning_mode"}, exclude_none=True)) + query = clear_statement(query) log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_executor(query, self.ReadOptions.parse(options)) + df = self._query_on_executor(query, self.SQLOptions.parse(options)) log.info("|Spark| DataFrame successfully created from SQL statement ") return df @@ -151,7 +159,12 @@ def read_source_as_df( limit=limit, ) - result = self.sql(query, read_options) + log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) + log_lines(log, query) + + result = self._query_on_executor(query, self.ReadOptions.parse(read_options)) + + log.info("|Spark| DataFrame successfully created from SQL statement ") if alias: result = result.drop(alias) @@ -241,9 +254,9 @@ def get_min_max_values( def _query_on_executor( self, query: str, - options: JDBCReadOptions, + options: JDBCSQLOptions | JDBCReadOptions, ) -> DataFrame: - jdbc_properties = self._get_jdbc_properties(options, exclude={"partitioning_mode"}, exclude_none=True) + jdbc_properties = self._get_jdbc_properties(options, exclude_none=True) return self.spark.read.format("jdbc").options(dbtable=f"({query}) T", **jdbc_properties).load() def _exclude_partition_options( diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 50eda6a51..433eaa9fe 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -515,6 +515,109 @@ def _mode_is_deprecated(cls, values): return values +class JDBCSQLOptions(JDBCOptions): + """Options specifically for SQL queries + + These options allow you to specify configurations for executing SQL queries + without relying on Spark's partitioning mechanisms. + + .. note:: + + You can pass any JDBC configuration + `supported by Spark `_, + tailored to optimize SQL query execution. Option names should be in ``camelCase``! + + """ + + partition_column: Optional[str] = None + """Column used to partition data across multiple executors for parallel query processing. + + .. warning:: + It is highly recommended to use primary key, or at least a column with an index + to avoid performance issues. + + Example of using partition_column for range-based partitioning: + + .. code-block:: sql + + -- If partition_column is 'id', with num_partitions=4, lower_bound=1, and upper_bound=100: + -- Executor 1 processes IDs from 1 to 25 + SELECT ... FROM table WHERE id >= 1 AND id < 26 + -- Executor 2 processes IDs from 26 to 50 + SELECT ... FROM table WHERE id >= 26 AND id < 51 + -- Executor 3 processes IDs from 51 to 75 + SELECT ... FROM table WHERE id >= 51 AND id < 76 + -- Executor 4 processes IDs from 76 to 100 + SELECT ... FROM table WHERE id >= 76 AND id <= 100 + + + -- General case for Executor N + SELECT ... FROM table + WHERE partition_column >= (lower_bound + (N-1) * stride) + AND partition_column <= upper_bound + -- Where ``stride`` is calculated as ``(upper_bound - lower_bound) / num_partitions``. 
+ """ + + num_partitions: Optional[int] = None + """Number of jobs created by Spark to read the table content in parallel.""" # noqa: WPS322 + + lower_bound: Optional[int] = None + """Defines the starting boundary for partitioning the query's data. Mandatory if :obj:`~partition_column~ is set""" # noqa: WPS322 + + upper_bound: Optional[int] = None + """Sets the ending boundary for data partitioning. Mandatory if :obj:`~partition_column~ is set""" # noqa: WPS322 + + session_init_statement: Optional[str] = None + '''After each database session is opened to the remote DB and before starting to read data, + this option executes a custom SQL statement (or a PL/SQL block). + + Use this to implement session initialization code. + + Example: + + .. code:: python + + sessionInitStatement = """ + BEGIN + execute immediate + 'alter session set "_serial_direct_read"=true'; + END; + """ + ''' + + fetchsize: int = 100_000 + """Fetch N rows from an opened cursor per one read round. + + Tuning this option can influence performance of reading. + + .. warning:: + + Default value is different from Spark. + + Spark uses driver's own value, and it may be different in different drivers, + and even versions of the same driver. For example, Oracle has + default ``fetchsize=10``, which is absolutely not usable. + + Thus we've overridden default value with ``100_000``, which should increase reading performance. + """ + + class Config: + known_options = READ_OPTIONS - {"partitioning_mode"} + prohibited_options = JDBCOptions.Config.prohibited_options | {"partitioning_mode"} + alias_generator = to_camel + + @root_validator(pre=True) + def _check_partition_fields(cls, values): + num_partitions = values.get("num_partitions") + lower_bound = values.get("lower_bound") + upper_bound = values.get("upper_bound") + + if num_partitions is not None and num_partitions > 1: + if lower_bound is None or upper_bound is None: + raise ValueError("lower_bound and upper_bound must be set if num_partitions > 1") + return values + + @deprecated( "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'ReadOptions' or 'WriteOptions' instead", category=UserWarning, diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index 1fb74095f..526fe1ee6 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -961,3 +961,47 @@ def function_finalizer(): df = postgres.sql(f"SELECT {func}(2, 'cde') AS result") result_df = pandas.DataFrame([[2]], columns=["result"]) processing.assert_equal_df(df=df, other_frame=result_df) + + +@pytest.mark.parametrize( + "options_class, options_kwargs, expected_warning", + [ + (Postgres.ReadOptions, {"fetchsize": 5000, "sessionInitStatement": "SET timezone TO 'UTC'"}, UserWarning), + (Postgres.SQLOptions, {"fetchsize": 5000, "sessionInitStatement": "SET timezone TO 'UTC'"}, None), + ], +) +def test_postgres_connection_sql_options( + options_class, + options_kwargs, + expected_warning, + spark, + processing, + load_table_data, +): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + table = load_table_data.full_name + options = options_class(**options_kwargs) + + if expected_warning: + with pytest.warns( + expected_warning, + match="Using `ReadOptions` for `sql` method is deprecated, use `SQLOptions` instead.", + ): + df = postgres.sql(f"SELECT * FROM {table}", options=options) + else: + df = postgres.sql(f"SELECT * FROM {table}", options=options) + + table_df = processing.get_expected_dataframe( + schema=load_table_data.schema, + table=load_table_data.table, + order_by="id_int", + ) + + processing.assert_equal_df(df=df, other_frame=table_df) diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index 47148c6b9..e793966ae 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -261,3 +261,27 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): def test_jdbc_write_options_mode_wrong(options): with pytest.raises(ValueError, match="value is not a valid enumeration member"): Postgres.WriteOptions(**options) + + +@pytest.mark.parametrize( + "options, expected_message", + [ + ({"num_partitions": 2}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ({"num_partitions": 2, "lower_bound": 0}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ({"num_partitions": 2, "upper_bound": 10}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ], +) +def test_jdbc_sql_options_partition_bounds(options, expected_message): + with pytest.raises(ValueError, match=expected_message): + Postgres.SQLOptions(**options) + + +def test_jdbc_sql_options_partitioning_mode_prohibited(): + with pytest.raises(ValueError, match=r"Options \['partitioning_mode'\] are not allowed"): + Postgres.SQLOptions(partitioning_mode="range") + + +def test_jdbc_sql_options_default(): + options = Postgres.SQLOptions() + assert options.fetchsize == 100_000 + assert options.query_timeout is None From 24660e56700874fd3e912e4ef9ea26ea013c03b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= 
=?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 17 May 2024 14:59:23 +0000 Subject: [PATCH 52/71] [DOP-14059] Add Troubleshooting guide --- .../next_release/275.improvement.rst | 1 + .../spark_s3/troubleshooting.rst | 20 +-- docs/index.rst | 9 +- docs/logging.rst | 152 +++++++++++++++++- docs/troubleshooting/index.rst | 25 +++ docs/troubleshooting/spark.rst | 79 +++++++++ 6 files changed, 263 insertions(+), 23 deletions(-) create mode 100644 docs/changelog/next_release/275.improvement.rst create mode 100644 docs/troubleshooting/index.rst create mode 100644 docs/troubleshooting/spark.rst diff --git a/docs/changelog/next_release/275.improvement.rst b/docs/changelog/next_release/275.improvement.rst new file mode 100644 index 000000000..5985618a0 --- /dev/null +++ b/docs/changelog/next_release/275.improvement.rst @@ -0,0 +1 @@ +Add generic ``Troubleshooting`` guide. diff --git a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst index 2f3017547..e3474c20e 100644 --- a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst +++ b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst @@ -3,6 +3,10 @@ Spark S3 Troubleshooting ======================== +.. note:: + + General guide: :ref:`troubleshooting`. + More details: * `Hadoop AWS Troubleshooting Guide `_ @@ -34,12 +38,7 @@ How to determine reason Make logging more verbose ^^^^^^^^^^^^^^^^^^^^^^^^^ -Change Spark session log level to ``DEBUG`` to print result of each attempt: - -.. code:: python - - spark.sparkContext.setLogLevel("debug") - +Change Spark session log level to :ref:`DEBUG ` to print result of each attempt. Resulting logs will look like this .. dropdown:: See log @@ -171,15 +170,6 @@ Resulting logs will look like this 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 -After getting all information you need, make logs less verbose: - -.. code:: python - - spark.sparkContext.setLogLevel("info") - - # or - spark.sparkContext.setLogLevel("warn") - Change number of retries ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/index.rst b/docs/index.rst index 71d3fc250..479a7c72c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,8 @@ install/index quickstart concepts + logging + troubleshooting/index .. toctree:: :maxdepth: 3 @@ -62,13 +64,6 @@ hooks/index plugins -.. toctree:: - :maxdepth: 2 - :caption: Misc - :hidden: - - logging - .. toctree:: :maxdepth: 2 :caption: Development diff --git a/docs/logging.rst b/docs/logging.rst index 6fc4eda41..1198a64d1 100644 --- a/docs/logging.rst +++ b/docs/logging.rst @@ -1,7 +1,157 @@ .. _logging: Logging -========= +======= + +Logging is quite important to understant what's going on under the hood of onETL. + +Default logging level for Python interpreters is ``WARNING``, +but most of onETL logs are in ``INFO`` level, so users usually don't see much. 
+ +To change logging level, there is a function :obj:`setup_logging ` +which should be called at the top of the script: + +.. code:: python + + from onetl.log import setup_logging + from other.lib import some, more, imports + + setup_logging() + + # rest of code + ... + +This changes both log level and log formatting to something like this: + +.. dropdown:: See logs + + .. code:: text + + 2024-04-12 10:12:10,834 [INFO ] MainThread: |onETL| Using IncrementalStrategy as a strategy + 2024-04-12 10:12:10,835 [INFO ] MainThread: =================================== DBReader.run() starts =================================== + 2024-04-12 10:12:10,835 [INFO ] MainThread: |DBReader| Getting Spark type for HWM expression: 'updated_at' + 2024-04-12 10:12:10,836 [INFO ] MainThread: |MSSQL| Fetching schema of table 'source_schema.table' ... + 2024-04-12 10:12:11,636 [INFO ] MainThread: |MSSQL| Schema fetched. + 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Got Spark field: StructField('updated_at', TimestampType(), True) + 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Detected HWM type: 'ColumnDateTimeHWM' + 2024-04-12 10:12:11,643 [INFO ] MainThread: |IncrementalStrategy| Fetching HWM from HorizonHWMStore: + 2024-04-12 10:12:11,643 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb' + 2024-04-12 10:12:12,181 [INFO ] MainThread: |IncrementalStrategy| Fetched HWM: + 2024-04-12 10:12:12,182 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 10:12:12,182 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,182 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,182 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,184 [INFO ] MainThread: value = datetime.datetime(2024, 4, 11, 18, 10, 2, 120000), + 2024-04-12 10:12:12,184 [INFO ] MainThread: ) + 2024-04-12 10:12:12,184 [INFO ] MainThread: |MSSQL| -> |Spark| Reading DataFrame from source using parameters: + 2024-04-12 10:12:12,185 [INFO ] MainThread: source = 'source_schema.table' + 2024-04-12 10:12:12,185 [INFO ] MainThread: columns = [ + 2024-04-12 10:12:12,185 [INFO ] MainThread: 'id', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'new_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'old_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'updated_at', + 2024-04-12 10:12:12,186 [INFO ] MainThread: ] + 2024-04-12 10:12:12,187 [INFO ] MainThread: where = "field = 'some'" + 2024-04-12 10:12:12,187 [INFO ] MainThread: hwm = AutoDetectHWM( + 2024-04-12 10:12:12,187 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,187 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,187 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,188 [INFO ] MainThread: ) + 2024-04-12 10:12:12,188 [INFO ] MainThread: options = { + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'fetchsize': 100000, + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'numPartitions': 1, + 2024-04-12 10:12:12,189 [INFO ] MainThread: 'partitioningMode': 'range', + 2024-04-12 10:12:12,189 [INFO ] MainThread: } + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Checking connection availability... 
+ 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Using connection parameters: + 2024-04-12 10:12:12,190 [INFO ] MainThread: user = 'db_user' + 2024-04-12 10:12:12,190 [INFO ] MainThread: password = SecretStr('**********') + 2024-04-12 10:12:12,190 [INFO ] MainThread: host = 'mssql.host' + 2024-04-12 10:12:12,190 [INFO ] MainThread: port = 1433 + 2024-04-12 10:12:12,191 [INFO ] MainThread: database = 'somedb' + 2024-04-12 10:12:12,191 [INFO ] MainThread: extra = {'ApplicationIntent': 'ReadOnly', 'trustServerCertificate': 'true'} + 2024-04-12 10:12:12,191 [INFO ] MainThread: jdbc_url = 'jdbc:sqlserver:/mssql.host:1433' + 2024-04-12 10:12:12,579 [INFO ] MainThread: |MSSQL| Connection is available. + 2024-04-12 10:12:12,581 [INFO ] MainThread: |MSSQL| Executing SQL query (on driver): + 2024-04-12 10:12:12,581 [INFO ] MainThread: SELECT + 2024-04-12 10:12:12,581 [INFO ] MainThread: MIN(updated_at) AS "min", + 2024-04-12 10:12:12,582 [INFO ] MainThread: MAX(updated_at) AS "max" + 2024-04-12 10:12:12,582 [INFO ] MainThread: FROM + 2024-04-12 10:12:12,582 [INFO ] MainThread: source_schema.table + 2024-04-12 10:12:12,582 [INFO ] MainThread: WHERE + 2024-04-12 10:12:12,582 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:12:12,583 [INFO ] MainThread: AND + 2024-04-12 10:12:12,583 [INFO ] MainThread: (updated_at >= CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,537 [INFO ] MainThread: |MSSQL| Received values: + 2024-04-12 10:16:22,538 [INFO ] MainThread: MIN(updated_at) = datetime.datetime(2024, 4, 11, 21, 10, 7, 397000) + 2024-04-12 10:16:22,538 [INFO ] MainThread: MAX(updated_at) = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000) + 2024-04-12 10:16:22,540 [INFO ] MainThread: |MSSQL| Executing SQL query (on executor): + 2024-04-12 10:16:22,540 [INFO ] MainThread: SELECT + 2024-04-12 10:16:22,540 [INFO ] MainThread: id, + 2024-04-12 10:16:22,541 [INFO ] MainThread: new_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: old_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: updated_at + 2024-04-12 10:16:22,541 [INFO ] MainThread: FROM + 2024-04-12 10:16:22,541 [INFO ] MainThread: source_schema.table + 2024-04-12 10:16:22,542 [INFO ] MainThread: WHERE + 2024-04-12 10:16:22,542 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at > CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at <= CAST('2024-04-12T13:12:02.123000' AS datetime2)) + 2024-04-12 10:16:22,892 [INFO ] MainThread: |Spark| DataFrame successfully created from SQL statement + 2024-04-12 10:16:22,892 [INFO ] MainThread: ------------------------------------ DBReader.run() ends ------------------------------------ + 2024-04-12 10:40:42,409 [INFO ] MainThread: =================================== DBWriter.run() starts =================================== + 2024-04-12 10:40:42,409 [INFO ] MainThread: |Spark| -> |Hive| Writing DataFrame to target using parameters: + 2024-04-12 10:40:42,410 [INFO ] MainThread: target = 'target_source_schema.table' + 2024-04-12 10:40:42,410 [INFO ] MainThread: options = { + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'mode': 'append', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'format': 'orc', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'partitionBy': 'part_dt', + 2024-04-12 10:40:42,410 [INFO ] MainThread: } + 2024-04-12 10:40:42,411 [INFO ] MainThread: df_schema: + 2024-04-12 
10:40:42,412 [INFO ] MainThread: root + 2024-04-12 10:40:42,412 [INFO ] MainThread: |-- id: integer (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- new_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- old_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- updated_at: timestamp (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- part_dt: date (nullable = true) + 2024-04-12 10:40:42,414 [INFO ] MainThread: + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Checking connection availability... + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Using connection parameters: + 2024-04-12 10:40:42,421 [INFO ] MainThread: cluster = 'dwh' + 2024-04-12 10:40:42,475 [INFO ] MainThread: |Hive| Connection is available. + 2024-04-12 10:40:42,476 [INFO ] MainThread: |Hive| Fetching schema of table 'target_source_schema.table' ... + 2024-04-12 10:40:43,518 [INFO ] MainThread: |Hive| Schema fetched. + 2024-04-12 10:40:43,521 [INFO ] MainThread: |Hive| Table 'target_source_schema.table' already exists + 2024-04-12 10:40:43,521 [WARNING ] MainThread: |Hive| User-specified options {'partitionBy': 'part_dt'} are ignored while inserting into existing table. Using only table parameters from Hive metastore + 2024-04-12 10:40:43,782 [INFO ] MainThread: |Hive| Inserting data into existing table 'target_source_schema.table' ... + 2024-04-12 11:06:07,396 [INFO ] MainThread: |Hive| Data is successfully inserted into table 'target_source_schema.table'. + 2024-04-12 11:06:07,397 [INFO ] MainThread: ------------------------------------ DBWriter.run() ends ------------------------------------ + 2024-04-12 11:06:07,397 [INFO ] MainThread: |onETL| Exiting IncrementalStrategy + 2024-04-12 11:06:07,397 [INFO ] MainThread: |IncrementalStrategy| Saving HWM to 'HorizonHWMStore': + 2024-04-12 11:06:07,397 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 11:06:07,397 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 11:06:07,397 [INFO ] MainThread: entity = 'source_source_schema.table', + 2024-04-12 11:06:07,397 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 11:06:07,397 [INFO ] MainThread: value = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000), + 2024-04-12 11:06:07,397 [INFO ] MainThread: ) + 2024-04-12 11:06:07,495 [INFO ] MainThread: |IncrementalStrategy| HWM has been saved + +Each step performed by onETL is extensively logged, which should help with debugging. + +You can make logs even more verbose by changing level to ``DEBUG``: + +.. code:: python + + from onetl.log import setup_logging + + setup_logging(level="DEBUG", enable_clients=True) + + # rest of code + ... + +This also changes log level for all underlying Python libraries, e.g. showing each HTTP request being made, and so on. .. currentmodule:: onetl.log diff --git a/docs/troubleshooting/index.rst b/docs/troubleshooting/index.rst new file mode 100644 index 000000000..536170561 --- /dev/null +++ b/docs/troubleshooting/index.rst @@ -0,0 +1,25 @@ +.. _troubleshooting: + +Troubleshooting +=============== + +In case of error please follow instructions below: + +* Read the logs or exception messages you've faced with. + * If Python logs are note verbose enough, :ref:`increase the log level `. + * If Spark logs are note verbose enough, :ref:`increase the log level `. +* Read documentation related to a class or method you are using. 
+* `Google `_ the error message, and carefully read the search results:
+  * `StackOverflow `_ answers.
+  * `Spark `_ documentation.
+  * Documentation of the database or filesystem you are connecting to.
+  * Documentation of the underlying connector.
+* Search for known `issues `_, or create a new one.
+* Always use the most recent versions of onETL, PySpark and connector packages, :ref:`compatible with your environment `.
+
+.. toctree::
+    :maxdepth: 3
+    :caption: Troubleshooting
+    :hidden:
+
+    spark
diff --git a/docs/troubleshooting/spark.rst b/docs/troubleshooting/spark.rst
new file mode 100644
index 000000000..0c1a20eb9
--- /dev/null
+++ b/docs/troubleshooting/spark.rst
@@ -0,0 +1,79 @@
+.. _troubleshooting-spark:
+
+Spark Troubleshooting
+=====================
+
+Restarting Spark session
+------------------------
+
+Sometimes it is required to stop the current Spark session and start a new one, e.g. to add some .jar packages, or change the session config.
+But PySpark not only starts a Spark session, but also starts a Java virtual machine (JVM) process in the background,
+which keeps running after the session is stopped. So calling ``sparkSession.stop()`` `does not shut down the JVM `_,
+and this can cause issues.
+
+Also, apart from the JVM, stopping a Spark session does not clear the Spark context, which is a global object. So new
+Spark sessions are created using the same context object, and thus using the same Spark config options.
+
+To properly stop a Spark session, it is **required** to:
+* Stop the Spark session by calling ``sparkSession.stop()``.
+* **STOP THE PYTHON INTERPRETER**, e.g. by calling ``sys.exit()``.
+* Start a new Python interpreter.
+* Start a new Spark session with the config options you need.
+
+Skipping some of these steps can lead to issues with creating a new Spark session.
+
+Driver log level
+----------------
+
+The default logging level for a Spark session is ``WARN``. To show more verbose logs, use:
+
+.. code:: python
+
+    spark.sparkContext.setLogLevel("INFO")
+
+or increase verbosity even more:
+
+.. code:: python
+
+    spark.sparkContext.setLogLevel("DEBUG")
+
+After getting all the information you need, you can return to the previous log level:
+
+.. code:: python
+
+    spark.sparkContext.setLogLevel("WARN")
+
+Executors log level
+-------------------
+
+``sparkContext.setLogLevel`` changes only the log level of the Spark session on the Spark **driver**.
+To make Spark executor logs more verbose, perform the following steps:
+
+* Create a ``log4j.properties`` file with content like this:
+
+  .. code-block:: jproperties
+
+    log4j.rootCategory=DEBUG, console
+
+    log4j.appender.console=org.apache.log4j.ConsoleAppender
+    log4j.appender.console.target=System.err
+    log4j.appender.console.layout=org.apache.log4j.PatternLayout
+    log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
+
+* Stop the existing Spark session and create a new one with the following options:
+
+  .. code-block:: python
+
+    from pyspark.sql import SparkSession
+
+    spark = (
+        SparkSession.builder.config("spark.files", "file:log4j.properties").config(
+            "spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties"
+        )
+        # you can apply the same logging settings to Spark driver, by uncommenting the line below
+        # .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties")
+        .getOrCreate()
+    )
+
+Each Spark executor will receive a copy of the ``log4j.properties`` file during start, and load it to change its own log level.
+The same approach can be used for the Spark driver as well, to investigate issues when a Spark session cannot start properly.
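To make the "Restarting Spark session" recipe above concrete, a minimal sketch follows; the JDBC driver coordinates are only an illustration of a config change that requires a fresh session:

.. code:: python

    import sys

    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        # example only: any session-level config change needs the full restart described above
        .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
        .getOrCreate()
    )

    # ... run your ETL code ...

    spark.stop()  # stops the session, but the JVM started by PySpark keeps running
    sys.exit(0)  # stop the Python interpreter so the JVM and SparkContext are gone too

    # start a NEW Python interpreter (a new process) and create a new session
    # with the changed config there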
From bacb483aae3a058cfd22696903d3f6029061108f Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Mon, 20 May 2024 17:28:32 +0300 Subject: [PATCH 53/71] [DOP-15749] - add ExecuteOptions, FetchOptions (#274) --- docs/changelog/next_release/274.feature.rst | 37 ++++++++++ docs/conf.py | 1 - .../db_connection/clickhouse/execute.rst | 18 +++-- .../db_connection/greenplum/execute.rst | 18 +++-- .../db_connection/mssql/execute.rst | 18 +++-- .../db_connection/mysql/execute.rst | 18 +++-- .../db_connection/oracle/execute.rst | 16 +++-- .../db_connection/postgres/execute.rst | 16 +++-- .../db_connection/teradata/execute.rst | 16 +++-- .../db_connection/greenplum/connection.py | 12 ++-- .../db_connection/jdbc_mixin/connection.py | 47 +++++++++---- .../db_connection/jdbc_mixin/options.py | 69 +++++++++++++++++++ .../db_connection/oracle/connection.py | 13 ++-- .../db_connection/postgres/connection.py | 11 ++- .../test_greenplum_integration.py | 7 +- .../test_postgres_integration.py | 34 ++++++++- .../test_db_options_unit.py | 6 +- .../test_greenplum_unit.py | 10 +-- .../test_jdbc_options_unit.py | 14 +++- 19 files changed, 302 insertions(+), 79 deletions(-) create mode 100644 docs/changelog/next_release/274.feature.rst diff --git a/docs/changelog/next_release/274.feature.rst b/docs/changelog/next_release/274.feature.rst new file mode 100644 index 000000000..c622e1392 --- /dev/null +++ b/docs/changelog/next_release/274.feature.rst @@ -0,0 +1,37 @@ +Divide general ``JDBCOptions`` into ``FetchOptions`` for fetching data and ``ExecuteOptions`` for executing statements. + +Before: + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.JDBCOptions(fetchsize=1000, query_timeout=30), + ) + + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.JDBCOptions(query_timeout=30), + ) + +After: + +.. code-block:: python + + from onetl.connection import Postgres + + # Using FetchOptions for fetching data + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.FetchOptions(fetchsize=1000), + ) + + # Using ExecuteOptions for executing statements + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.ExecuteOptions(query_timeout=30), + ) diff --git a/docs/conf.py b/docs/conf.py index dc4e425b7..25b32f4b0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,7 +54,6 @@ "sphinx_copybutton", "sphinx.ext.autodoc", "sphinx.ext.autosummary", - "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive "sphinxcontrib.plantuml", "sphinx.ext.extlinks", diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index 11e01c753..03e5d5df1 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in Clickhouse Use ``Clickhouse.fetch`` ~~~~~~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading Clickhouse config, or reading data from some reference table. 
Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = clickhouse.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Clickhouse.JDBCOptions(query_timeout=10), + options=Clickhouse.FetchOptions(query_timeout=10), ) clickhouse.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Clickhouse.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -98,7 +98,7 @@ Examples ENGINE = MergeTree() ORDER BY id """, - options=Clickhouse.JDBCOptions(query_timeout=10), + options=Clickhouse.ExecuteOptions(query_timeout=10), ) Notes @@ -113,7 +113,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index c0c415369..dcc32171b 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in Greenplum Use ``Greenplum.fetch`` ~~~~~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = greenplum.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Greenplum.JDBCOptions(query_timeout=10), + options=Greenplum.FetchOptions(query_timeout=10), ) greenplum.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Greenplum.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -99,7 +99,7 @@ Examples ) DISTRIBUTED BY id """, - options=Greenplum.JDBCOptions(query_timeout=10), + options=Greenplum.ExecuteOptions(query_timeout=10), ) Interaction schema @@ -145,7 +145,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. 
autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index 729c75a30..b8b795a66 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in MSSQL Use ``MSSQL.fetch`` ~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading MSSQL config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -49,7 +49,7 @@ Examples df = mssql.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MSSQL.JDBCOptions(query_timeout=10), + options=MSSQL.FetchOptions(query_timeout=10), ) mssql.close() value = df.collect()[0][0] # get value from first row and first column @@ -59,7 +59,7 @@ Use ``MSSQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -97,7 +97,7 @@ Examples value NUMBER ) """, - options=MSSQL.JDBCOptions(query_timeout=10), + options=MSSQL.ExecuteOptions(query_timeout=10), ) Options @@ -105,7 +105,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index 34460edae..de1f17002 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in MySQL Use ``MySQL.fetch`` ~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading MySQL config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = mysql.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MySQL.JDBCOptions(query_timeout=10), + options=MySQL.FetchOptions(query_timeout=10), ) mysql.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``MySQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. 
+Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -98,7 +98,7 @@ Examples ) ENGINE = InnoDB """, - options=MySQL.JDBCOptions(query_timeout=10), + options=MySQL.ExecuteOptions(query_timeout=10), ) Options @@ -106,7 +106,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/execute.rst b/docs/connection/db_connection/oracle/execute.rst index b8533b278..f43eb54be 100644 --- a/docs/connection/db_connection/oracle/execute.rst +++ b/docs/connection/db_connection/oracle/execute.rst @@ -20,7 +20,7 @@ Use ``Oracle.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Oracle config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = oracle.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Oracle.JDBCOptions(query_timeout=10), + options=Oracle.FetchOptions(query_timeout=10), ) oracle.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Oracle.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -98,7 +98,7 @@ Examples value NUMBER ) """, - options=Oracle.JDBCOptions(query_timeout=10), + options=Oracle.ExecuteOptions(query_timeout=10), ) Options @@ -106,7 +106,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 8c966c110..753c4f624 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -20,7 +20,7 @@ Use ``Postgres.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Postgres config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. 
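The "should be then closed" note above can be illustrated with the context-manager form, as a sketch (connection parameters omitted, table name hypothetical):

.. code:: python

    from onetl.connection import Postgres

    postgres = Postgres(...)  # hypothetical connection parameters

    with postgres:
        df = postgres.fetch(
            "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
            options=Postgres.FetchOptions(fetchsize=100),
        )

    value = df.collect()[0][0]  # the driver-side JDBC connection is closed when the block exits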
@@ -48,7 +48,7 @@ Examples df = postgres.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Postgres.JDBCOptions(query_timeout=10), + options=Postgres.FetchOptions(query_timeout=10), ) postgres.close() value = df.collect()[0][0] # get value from first row and first column @@ -58,7 +58,7 @@ Use ``Postgres.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -96,7 +96,7 @@ Examples value real ) """, - options=Postgres.JDBCOptions(query_timeout=10), + options=Postgres.ExecuteOptions(query_timeout=10), ) Options @@ -104,7 +104,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 545b473d8..3d48d0b9a 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -20,7 +20,7 @@ Use ``Teradata.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Teradata config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -45,7 +45,7 @@ Examples df = teradata.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Teradata.JDBCOptions(query_timeout=10), + options=Teradata.FetchOptions(query_timeout=10), ) teradata.close() value = df.collect()[0][0] # get value from first row and first column @@ -55,7 +55,7 @@ Use ``Teradata.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -95,7 +95,7 @@ Examples ) NO PRIMARY INDEX """, - options=Teradata.JDBCOptions(query_timeout=10), + options=Teradata.ExecuteOptions(query_timeout=10), ) Options @@ -103,7 +103,13 @@ Options .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: JDBCFetchOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. 
autopydantic_model:: JDBCExecuteOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 120d58008..c9730c44c 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -31,7 +31,11 @@ GreenplumWriteOptions, ) from onetl.connection.db_connection.jdbc_mixin import JDBCMixin -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, + JDBCOptions, +) from onetl.exception import MISSING_JVM_CLASS_MSG, TooManyParallelJobsError from onetl.hooks import slot, support_hooks from onetl.hwm import Window @@ -420,7 +424,7 @@ def _connector_params( **extra, } - def _options_to_connection_properties(self, options: JDBCOptions): + def _options_to_connection_properties(self, options: JDBCOptions | JDBCExecuteOptions | JDBCFetchOptions): # See https://github.com/pgjdbc/pgjdbc/pull/1252 # Since 42.2.9 Postgres JDBC Driver added new option readOnlyMode=transaction # Which is not a desired behavior, because `.fetch()` method should always be read-only @@ -439,7 +443,7 @@ def _get_server_setting(self, name: str) -> Any: log.debug("|%s| Executing SQL query (on driver):") log_lines(log, query, level=logging.DEBUG) - df = self._query_on_driver(query, self.JDBCOptions()) + df = self._query_on_driver(query, self.FetchOptions()) result = df.collect() log.debug( @@ -460,7 +464,7 @@ def _get_occupied_connections_count(self) -> int: log.debug("|%s| Executing SQL query (on driver):") log_lines(log, query, level=logging.DEBUG) - df = self._query_on_driver(query, self.JDBCOptions()) + df = self._query_on_driver(query, self.FetchOptions()) result = df.collect() log.debug( diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index dae2242b5..23d1ced8b 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -18,6 +18,10 @@ from onetl._util.java import get_java_gateway, try_import_java_class from onetl._util.spark import get_spark_version from onetl._util.version import Version +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCOptions as JDBCMixinOptions, ) @@ -64,6 +68,8 @@ class JDBCMixin(FrozenModel): password: SecretStr JDBCOptions = JDBCMixinOptions + FetchOptions = JDBCFetchOptions + ExecuteOptions = JDBCExecuteOptions DRIVER: ClassVar[str] _CHECK_QUERY: ClassVar[str] = "SELECT 1" @@ -141,7 +147,7 @@ def check(self): log_lines(log, self._CHECK_QUERY, level=logging.DEBUG) try: - self._query_optional_on_driver(self._CHECK_QUERY, self.JDBCOptions(fetchsize=1)) # type: ignore + self._query_optional_on_driver(self._CHECK_QUERY, self.FetchOptions(fetchsize=1)) log.info("|%s| Connection is available.", self.__class__.__name__) except Exception as e: log.exception("|%s| Connection is unavailable", self.__class__.__name__) @@ -153,7 +159,7 @@ def check(self): def fetch( self, query: str, - options: JDBCMixinOptions | dict | None = None, + options: JDBCFetchOptions | dict | None = None, ) -> DataFrame: """ **Immediately** execute SELECT statement **on Spark driver** and return in-memory 
DataFrame. |support_hooks| @@ -175,7 +181,7 @@ def fetch( SQL query to be executed. - options : dict, :obj:`~JDBCOptions`, default: ``None`` + options : dict, :obj:`~FetchOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``fetchsize`` or ``queryTimeout`` @@ -195,7 +201,14 @@ def fetch( log.info("|%s| Executing SQL query (on driver):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_driver(query, self.JDBCOptions.parse(options)) + df = self._query_on_driver( + query, + ( + self.FetchOptions.parse(options.dict()) # type: ignore + if isinstance(options, JDBCMixinOptions) + else self.FetchOptions.parse(options) + ), + ) log.info( "|%s| Query succeeded, resulting in-memory dataframe contains %d rows", @@ -208,7 +221,7 @@ def fetch( def execute( self, statement: str, - options: JDBCMixinOptions | dict | None = None, + options: JDBCExecuteOptions | JDBCMixinOptions | dict | None = None, ) -> DataFrame | None: """ **Immediately** execute DDL, DML or procedure/function **on Spark driver**. |support_hooks| @@ -227,7 +240,7 @@ def execute( Statement to be executed. - options : dict, :obj:`~JDBCOptions`, default: ``None`` + options : dict, :obj:`~JDBCExecuteOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``queryTimeout`` @@ -250,7 +263,11 @@ def execute( log.info("|%s| Executing statement (on driver):", self.__class__.__name__) log_lines(log, statement) - call_options = self.JDBCOptions.parse(options) + call_options = ( + self.ExecuteOptions.parse(options.dict()) + if isinstance(options, JDBCMixinOptions) + else self.ExecuteOptions.parse(options) + ) df = self._call_on_driver(statement, call_options) if df is not None: @@ -282,7 +299,7 @@ def _check_java_class_imported(cls, spark): def _query_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions | JDBCExecuteOptions, ) -> DataFrame: return self._execute_on_driver( statement=query, @@ -295,7 +312,7 @@ def _query_on_driver( def _query_optional_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions, ) -> DataFrame | None: return self._execute_on_driver( statement=query, @@ -308,7 +325,7 @@ def _query_optional_on_driver( def _call_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCExecuteOptions, ) -> DataFrame | None: return self._execute_on_driver( statement=query, @@ -320,7 +337,7 @@ def _call_on_driver( def _get_jdbc_properties( self, - options: JDBCMixinOptions, + options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions, **kwargs, ) -> dict[str, str]: """ @@ -330,7 +347,7 @@ def _get_jdbc_properties( result.update(options.dict(by_alias=True, **kwargs)) return stringify(result) - def _options_to_connection_properties(self, options: JDBCMixinOptions): + def _options_to_connection_properties(self, options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions): """ Converts human-readable Options class to ``java.util.Properties``. @@ -351,7 +368,7 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): ) return jdbc_options.asConnectionProperties() - def _get_jdbc_connection(self, options: JDBCMixinOptions): + def _get_jdbc_connection(self, options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions): if not self._last_connection_and_options: # connection class can be used in multiple threads. 
# each Python thread creates its own thread in JVM @@ -393,7 +410,7 @@ def _execute_on_driver( statement: str, statement_type: JDBCStatementType, callback: Callable[..., T], - options: JDBCMixinOptions, + options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions, read_only: bool, ) -> T: """ @@ -415,7 +432,7 @@ def _execute_statement( self, jdbc_statement, statement: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions | JDBCExecuteOptions, callback: Callable[..., T], read_only: bool, ) -> T: diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 349630719..2fe94c8bb 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -4,6 +4,8 @@ from typing import Optional +from typing_extensions import deprecated + try: from pydantic.v1 import Field except (ImportError, AttributeError): @@ -22,6 +24,7 @@ ) +@deprecated("Deprecated in 0.11.0 and will be removed in 1.0.0. Use FetchOptions or ExecuteOptions instead") class JDBCOptions(GenericOptions): """Generic options, related to specific JDBC driver. @@ -54,3 +57,69 @@ class Config: Default value depends on driver. For example, Oracle has default ``fetchsize=10``. """ + + +class JDBCFetchOptions(GenericOptions): + """Options related to fetching data from databases via JDBC. + + .. note :: + + You can pass any value + supported by underlying JDBC driver class, + even if it is not mentioned in this documentation. + """ + + class Config: + prohibited_options = PROHIBITED_OPTIONS + extra = "allow" + + query_timeout: Optional[int] = Field(default=None, alias="queryTimeout") + """The number of seconds the driver will wait for a statement to execute. + Zero means there is no limit. + + This option depends on driver implementation, + some drivers can check the timeout of each query instead of an entire JDBC batch. + """ + + fetchsize: Optional[int] = None + """How many rows to fetch per round trip. + + Tuning this option can influence performance of reading. + + .. warning:: + Default value depends on driver. For example, Oracle has + default ``fetchsize=10``. + """ + + +class JDBCExecuteOptions(GenericOptions): + """Options related to executing statements in databases via JDBC. + + .. note :: + + You can pass any value + supported by underlying JDBC driver class, + even if it is not mentioned in this documentation. + """ + + class Config: + prohibited_options = PROHIBITED_OPTIONS + extra = "allow" + + query_timeout: Optional[int] = Field(default=None, alias="queryTimeout") + """The number of seconds the driver will wait for a statement to execute. + Zero means there is no limit. + + This option depends on driver implementation, + some drivers can check the timeout of each query instead of an entire JDBC batch. + """ + + fetchsize: Optional[int] = None + """How many rows to fetch per round trip. + + Tuning this option can influence performance of reading. + + .. warning:: + Default value depends on driver. For example, Oracle has + default ``fetchsize=10``. 
+ """ diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index d566fa275..c21618381 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -22,7 +22,10 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCOptions, +) from onetl.connection.db_connection.oracle.dialect import OracleDialect from onetl.hooks import slot, support_hooks from onetl.hwm import Window @@ -272,14 +275,14 @@ def get_min_max_values( def execute( self, statement: str, - options: JDBCOptions | dict | None = None, # noqa: WPS437 + options: JDBCOptions | JDBCExecuteOptions | dict | None = None, # noqa: WPS437 ) -> DataFrame | None: statement = clear_statement(statement) log.info("|%s| Executing statement (on driver):", self.__class__.__name__) log_lines(log, statement) - call_options = self.JDBCOptions.parse(options) + call_options = self.ExecuteOptions.parse(options) df = self._call_on_driver(statement, call_options) self._handle_compile_errors(statement.strip(), call_options) @@ -336,7 +339,7 @@ def _get_compile_errors( type_name: str, schema: str, object_name: str, - options: JDBCOptions, + options: JDBCOptions | JDBCExecuteOptions, ) -> list[tuple[ErrorPosition, str]]: """ Get compile errors for the object. @@ -406,7 +409,7 @@ def _build_error_message(self, aggregated_errors: OrderedDict[ErrorPosition, str def _handle_compile_errors( self, statement: str, - options: JDBCOptions, + options: JDBCExecuteOptions, ) -> None: """ Oracle does not return compilation errors immediately. 
diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 16d317fee..2b67d43ec 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -8,7 +8,11 @@ from onetl._util.classproperty import classproperty from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, + JDBCOptions, +) from onetl.connection.db_connection.postgres.dialect import PostgresDialect from onetl.hooks import slot, support_hooks from onetl.impl import GenericOptions @@ -159,7 +163,10 @@ def jdbc_params(self) -> dict[str, str]: def instance_url(self) -> str: return f"{super().instance_url}/{self.database}" - def _options_to_connection_properties(self, options: JDBCOptions): # noqa: WPS437 + def _options_to_connection_properties( + self, + options: JDBCOptions | JDBCFetchOptions | JDBCExecuteOptions, + ): # noqa: WPS437 # See https://github.com/pgjdbc/pgjdbc/pull/1252 # Since 42.2.9 Postgres JDBC Driver added new option readOnlyMode=transaction # Which is not a desired behavior, because `.fetch()` method should always be read-only diff --git a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py index 5c2d17115..6775244ee 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py @@ -77,7 +77,7 @@ def test_greenplum_connection_fetch(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = greenplum.fetch(f"SELECT * FROM {table}{suffix}", Greenplum.JDBCOptions(fetchsize=2)) + df = greenplum.fetch(f"SELECT * FROM {table}{suffix}", Greenplum.FetchOptions(fetchsize=2)) table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -108,7 +108,10 @@ def test_greenplum_connection_ddl(spark, processing, get_schema_table, suffix): table_name, schema, table = get_schema_table fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} - assert not greenplum.execute(f"SET search_path TO {schema}, public{suffix}", Greenplum.JDBCOptions(queryTimeout=1)) + assert not greenplum.execute( + f"SET search_path TO {schema}, public{suffix}", + Greenplum.ExecuteOptions(queryTimeout=1), + ) assert not greenplum.execute(processing.create_schema_ddl(schema) + suffix) assert not greenplum.execute(processing.create_table_ddl(table, fields, schema) + suffix) diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index 526fe1ee6..de2f9e522 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -91,7 +91,7 @@ def test_postgres_connection_fetch(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = postgres.fetch(f"SELECT * FROM {table}{suffix}", Postgres.JDBCOptions(fetchsize=2)) + df = postgres.fetch(f"SELECT * FROM 
{table}{suffix}", Postgres.FetchOptions(fetchsize=2)) table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -122,7 +122,7 @@ def test_postgres_connection_ddl(spark, processing, get_schema_table, suffix): table_name, schema, table = get_schema_table fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} - assert not postgres.execute(f"SET search_path TO {schema}, public{suffix}", Postgres.JDBCOptions(queryTimeout=1)) + assert not postgres.execute(f"SET search_path TO {schema}, public{suffix}", Postgres.ExecuteOptions(queryTimeout=1)) assert not postgres.execute(processing.create_schema_ddl(schema) + suffix) assert not postgres.execute(processing.create_table_ddl(table, fields, schema) + suffix) @@ -1005,3 +1005,33 @@ def test_postgres_connection_sql_options( ) processing.assert_equal_df(df=df, other_frame=table_df) + + +def test_postgres_fetch_with_legacy_jdbc_options(spark, processing): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + options = Postgres.JDBCOptions(fetchsize=10) + + df = postgres.fetch("SELECT CURRENT_TIMESTAMP;", options=options) + assert df is not None + + +def test_postgres_execute_with_legacy_jdbc_options(spark, processing): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + options = Postgres.JDBCOptions(query_timeout=30) + postgres.execute("DROP TABLE IF EXISTS temp_table;", options=options) diff --git a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py index 8e51d0a89..597dbf0c0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py @@ -13,7 +13,8 @@ [ Postgres.ReadOptions, Postgres.WriteOptions, - Postgres.JDBCOptions, + Postgres.FetchOptions, + Postgres.ExecuteOptions, Postgres.Options, Greenplum.ReadOptions, Greenplum.WriteOptions, @@ -107,7 +108,8 @@ def test_db_options_parse_mismatch_connection_and_options_types(connection, opti [ Postgres.ReadOptions, Postgres.WriteOptions, - Postgres.JDBCOptions, + Postgres.FetchOptions, + Postgres.ExecuteOptions, Greenplum.ReadOptions, Greenplum.WriteOptions, Hive.WriteOptions, diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 5c824d127..55c3c942e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -233,7 +233,8 @@ def test_greenplum_write_options_default(): [ (Greenplum.ReadOptions, "GreenplumReadOptions"), (Greenplum.WriteOptions, "GreenplumWriteOptions"), - (Greenplum.JDBCOptions, "JDBCOptions"), + (Greenplum.FetchOptions, "JDBCFetchOptions"), + (Greenplum.ExecuteOptions, "JDBCExecuteOptions"), (Greenplum.Extra, "GreenplumExtra"), ], ) @@ -243,7 +244,8 @@ def test_greenplum_jdbc_options_populated_by_connection_class(klass, name): klass(user="me", password="abc", driver="some.Class", url="jdbc:postgres://some/db") -def test_greenplum_read_write_options_populated_by_connection_class(): +@pytest.mark.parametrize("options_class", [Greenplum.FetchOptions, Greenplum.ExecuteOptions]) +def 
test_greenplum_read_write_options_populated_by_connection_class(options_class): error_msg = r"Options \['dbschema', 'dbtable'\] are not allowed to use in a GreenplumReadOptions" with pytest.raises(ValueError, match=error_msg): Greenplum.ReadOptions(dbschema="myschema", dbtable="mytable") @@ -252,8 +254,8 @@ def test_greenplum_read_write_options_populated_by_connection_class(): with pytest.raises(ValueError, match=error_msg): Greenplum.WriteOptions(dbschema="myschema", dbtable="mytable") - # JDBCOptions does not have such restriction - options = Greenplum.JDBCOptions(dbschema="myschema", dbtable="mytable") + # FetchOptions & ExecuteOptions does not have such restriction + options = options_class(dbschema="myschema", dbtable="mytable") assert options.dbschema == "myschema" assert options.dbtable == "mytable" diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index e793966ae..90c0d1903 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -44,7 +44,8 @@ def test_jdbc_options_default(): ("properties", {"abc": "cde"}), ], ) -def test_jdbc_read_write_options_populated_by_connection_class(arg, value): +@pytest.mark.parametrize("options_class", [Postgres.FetchOptions, Postgres.ExecuteOptions]) +def test_jdbc_read_write_options_populated_by_connection_class(arg, value, options_class): error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCReadOptions" with pytest.raises(ValueError, match=error_msg): Postgres.ReadOptions.parse({arg: value}) @@ -53,8 +54,8 @@ def test_jdbc_read_write_options_populated_by_connection_class(arg, value): with pytest.raises(ValueError, match=error_msg): Postgres.WriteOptions.parse({arg: value}) - # JDBCOptions does not have such restriction - options = Postgres.JDBCOptions.parse({arg: value}) + # FetchOptions & ExecuteOptions does not have such restriction + options = options_class.parse({arg: value}) assert options.dict()[arg] == value @@ -285,3 +286,10 @@ def test_jdbc_sql_options_default(): options = Postgres.SQLOptions() assert options.fetchsize == 100_000 assert options.query_timeout is None + + +def test_jdbc_deprecated_jdbcoptions(): + deprecated_warning = "Deprecated in 0.11.0 and will be removed in 1.0.0. 
Use FetchOptions or ExecuteOptions instead" + + with pytest.warns(DeprecationWarning, match=deprecated_warning): + Postgres.JDBCOptions(fetchsize=10, query_timeout=30) From aa2b753ec075ccabce77e705b0c7552cfadefb7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 17 May 2024 11:14:50 +0000 Subject: [PATCH 54/71] [DOP-14061] Replace asserts in documentation with doctest style --- .github/workflows/test-core.yml | 2 +- .pre-commit-config.yaml | 2 +- .../next_release/273.improvement.rst | 1 + docs/conf.py | 6 + onetl/_internal.py | 88 ++- onetl/_util/classproperty.py | 3 +- onetl/base/base_file_connection.py | 199 ++++--- onetl/base/base_file_filter.py | 15 +- onetl/base/base_file_limit.py | 61 +- .../db_connection/jdbc_mixin/connection.py | 1 - .../file_connection/file_connection.py | 89 ++- .../mixins/rename_dir_mixin.py | 12 +- onetl/file/file_downloader/file_downloader.py | 150 +++-- onetl/file/file_downloader/result.py | 55 +- onetl/file/file_mover/file_mover.py | 150 +++-- onetl/file/file_mover/result.py | 55 +- onetl/file/file_result.py | 540 ++++++++---------- onetl/file/file_set.py | 128 ++--- onetl/file/file_uploader/file_uploader.py | 161 +++--- onetl/file/file_uploader/result.py | 55 +- onetl/file/filter/match_all_filters.py | 19 +- onetl/file/limit/limits_reached.py | 23 +- onetl/file/limit/limits_stop_at.py | 16 +- onetl/file/limit/reset_limits.py | 26 +- onetl/hooks/hook.py | 102 ++-- onetl/hooks/hook_collection.py | 255 ++++----- onetl/hooks/method_inheritance_stack.py | 46 +- onetl/hwm/store/hwm_class_registry.py | 45 +- onetl/strategy/incremental_strategy.py | 47 +- onetl/strategy/snapshot_strategy.py | 11 +- setup.cfg | 4 +- .../test_postgres_integration.py | 30 +- 32 files changed, 1176 insertions(+), 1221 deletions(-) create mode 100644 docs/changelog/next_release/273.improvement.rst diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index b7a1b3cec..65d681dc1 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -72,7 +72,7 @@ jobs: - name: Run tests run: | ./run_tests.sh -m 'not connection' - ./run_tests.sh onetl/_util + ./run_tests.sh onetl/_util onetl/_internal.py onetl/hooks onetl/file/filter onetl/file/limit onetl/hwm/store/hwm_class_registry.py - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0745931b..aa7bde988 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ default_language_version: - python: python3.12 + python: python3.11 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docs/changelog/next_release/273.improvement.rst b/docs/changelog/next_release/273.improvement.rst new file mode 100644 index 000000000..1e9650b4c --- /dev/null +++ b/docs/changelog/next_release/273.improvement.rst @@ -0,0 +1 @@ +Replace all ``assert`` in documentation with doctest syntax. This should make documentation more readable. diff --git a/docs/conf.py b/docs/conf.py index 25b32f4b0..d6b1839e2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -69,6 +69,12 @@ autodoc_pydantic_field_list_validators = False sphinx_tabs_disable_tab_closing = True +# prevent >>>, ... and doctest outputs from copying +copybutton_prompt_text = r">>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True +copybutton_copy_empty_lines = False +copybutton_only_copy_prompt_lines = True + towncrier_draft_autoversion_mode = "draft" towncrier_draft_include_empty = False towncrier_draft_working_directory = PROJECT_ROOT_DIR diff --git a/onetl/_internal.py b/onetl/_internal.py index 298ecdf95..361bb3e82 100644 --- a/onetl/_internal.py +++ b/onetl/_internal.py @@ -10,8 +10,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any -from etl_entities.process import ProcessStackManager - try: from pydantic.v1 import SecretStr except (ImportError, AttributeError): @@ -33,19 +31,18 @@ def clear_statement(statement: str) -> str: Examples -------- - .. code:: python - - assert clear_statement("SELECT * FROM mytable") == "SELECT * FROM mytable" - assert clear_statement("SELECT * FROM mytable ; ") == "SELECT * FROM mytable" - assert ( - clear_statement("CREATE TABLE mytable (id NUMBER)") - == "CREATE TABLE mytable (id NUMBER)" - ) - assert clear_statement("BEGIN ... END") == "BEGIN ... END;" + >>> clear_statement("SELECT * FROM mytable") + 'SELECT * FROM mytable' + >>> clear_statement("SELECT * FROM mytable ; ") + 'SELECT * FROM mytable' + >>> clear_statement("CREATE TABLE mytable (id NUMBER)") + 'CREATE TABLE mytable (id NUMBER)' + >>> clear_statement("BEGIN ... END") + 'BEGIN ... END;' """ - statement = statement.rstrip().lstrip("\n\r").rstrip(";") - if statement.lower().strip().endswith("end"): + statement = statement.rstrip().lstrip("\n\r").rstrip(";").rstrip() + if statement.lower().endswith("end"): statement += ";" return statement @@ -57,11 +54,12 @@ def uniq_ignore_case(orig_list: list[str]) -> list[str]: Examples -------- - .. code:: python - - assert uniq_ignore_case(["a", "c"]) == ["a", "c"] - assert uniq_ignore_case(["A", "a", "c"]) == ["A", "c"] - assert uniq_ignore_case(["a", "A", "c"]) == ["a", "c"] + >>> uniq_ignore_case(["a", "c"]) + ['a', 'c'] + >>> uniq_ignore_case(["A", "a", "c"]) + ['A', 'c'] + >>> uniq_ignore_case(["a", "A", "c"]) + ['a', 'c'] """ result: list[str] = [] @@ -90,14 +88,22 @@ def stringify(value: Any, quote: bool = False) -> Any: # noqa: WPS212 Examples -------- - >>> assert stringify(1) == "1" - >>> assert stringify(True) == "true" - >>> assert stringify(False) == "false" - >>> assert stringify(None) == "null" - >>> assert stringify("string") == "string" - >>> assert stringify("string", quote=True) == '"string"' - >>> assert stringify({"abc": 1}) == {"abc": "1"} - >>> assert stringify([1, True, False, None, "string"]) == ["1", "true", "false", "null", "string"] + >>> stringify(1) + '1' + >>> stringify(True) + 'true' + >>> stringify(False) + 'false' + >>> stringify(None) + 'null' + >>> stringify("string") + 'string' + >>> stringify("string", quote=True) + '"string"' + >>> stringify({"abc": 1}) + {'abc': '1'} + >>> stringify([1, True, False, None, "string"]) + ['1', 'true', 'false', 'null', 'string'] """ if isinstance(value, dict): @@ -131,9 +137,8 @@ def to_camel(string: str) -> str: Examples -------- - .. code:: python - - assert to_camel("some_value") == "someValue" + >>> to_camel("some_value") + 'someValue' """ return "".join(word.capitalize() if index > 0 else word for index, word in enumerate(string.split("_"))) @@ -151,24 +156,17 @@ def generate_temp_path(root: PurePath) -> PurePath: Examples -------- - View files - - .. 
code:: python - - from etl_entities.process import Process - - from pathlib import Path - - assert generate_temp_path(Path("/tmp")) == Path( - "/tmp/onetl/currenthost/myprocess/20230524122150", - ) - - with Process(dag="mydag", task="mytask"): - assert generate_temp_path(Path("/abc")) == Path( - "/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150", - ) + >>> from etl_entities.process import Process + >>> from pathlib import Path + >>> generate_temp_path(Path("/tmp")) # doctest: +SKIP + Path("/tmp/onetl/currenthost/myprocess/20230524122150") + >>> with Process(dag="mydag", task="mytask"): # doctest: +SKIP + ... generate_temp_path(Path("/abc")) + Path("/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150") """ + from etl_entities.process import ProcessStackManager + current_process = ProcessStackManager.get_current() current_dt = datetime.now().strftime(DATETIME_FORMAT) return root / "onetl" / current_process.host / current_process.full_name / current_dt diff --git a/onetl/_util/classproperty.py b/onetl/_util/classproperty.py index 8738304dc..e971638ab 100644 --- a/onetl/_util/classproperty.py +++ b/onetl/_util/classproperty.py @@ -15,7 +15,8 @@ class classproperty(property): # noqa: N801 ... def attribute(cls): ... return 123 >>> # no call - >>> assert My.attribute == 123 + >>> My.attribute + 123 """ def __init__(self, f): diff --git a/onetl/base/base_file_connection.py b/onetl/base/base_file_connection.py index 3d1aa2357..823386d55 100644 --- a/onetl/base/base_file_connection.py +++ b/onetl/base/base_file_connection.py @@ -35,11 +35,12 @@ def path_exists(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.path_exists("/path/to/file.csv") - assert connection.path_exists("/path/to/dir") - assert not connection.path_exists("/path/to/missing") + >>> connection.path_exists("/path/to/file.csv") + True + >>> connection.path_exists("/path/to/dir") + True + >>> connection.path_exists("/path/to/missing") + False """ @abstractmethod @@ -64,10 +65,10 @@ def is_file(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.is_file("/path/to/dir/file.csv") - assert not connection.is_file("/path/to/dir") + >>> connection.is_file("/path/to/dir/file.csv") + True + >>> connection.is_file("/path/to/dir") + False """ @abstractmethod @@ -92,10 +93,10 @@ def is_dir(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.is_dir("/path/to/dir") - assert not connection.is_dir("/path/to/dir/file.csv") + >>> connection.is_dir("/path/to/dir") + True + >>> connection.is_dir("/path/to/dir/file.csv") + False """ @abstractmethod @@ -119,11 +120,11 @@ def get_stat(self, path: os.PathLike | str) -> PathStatProtocol: Examples -------- - .. code:: python - - stat = connection.get_stat("/path/to/file.csv") - assert stat.st_size > 0 - assert stat.st_uid == 12345 # owner id + >>> stat = connection.get_stat("/path/to/file.csv") + >>> stat.st_size # in bytes + 1024 + >>> stat.st_uid # owner id or name + 12345 """ @abstractmethod @@ -151,11 +152,11 @@ def resolve_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. 
code:: python - - dir_path = connection.resolve_dir("/path/to/dir") - assert os.fspath(dir_path) == "/path/to/dir" - assert dir_path.stat.st_uid == 12345 # owner id + >>> dir_path = connection.resolve_dir("/path/to/dir") + >>> os.fspath(dir_path) + '/path/to/dir' + >>> dir_path.stat.st_uid # owner id + 12345 """ @abstractmethod @@ -183,11 +184,11 @@ def resolve_file(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. code:: python - - file_path = connection.resolve_file("/path/to/dir/file.csv") - assert os.fspath(file_path) == "/path/to/dir/file.csv" - assert file_path.stat.st_uid == 12345 # owner id + >>> file_path = connection.resolve_file("/path/to/dir/file.csv") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_uid # owner id + 12345 """ @abstractmethod @@ -212,9 +213,9 @@ def create_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. code:: python - - dir_path = connection.create_dir("/path/to/dir") + >>> dir_path = connection.create_dir("/path/to/dir") + >>> os.fspath(dir_path) + '/path/to/dir' """ @abstractmethod @@ -245,12 +246,12 @@ def remove_file(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.remove_file("/path/to/file.csv") - assert not connection.path_exists("/path/to/dir/file.csv") - - assert not connection.remove_file("/path/to/file.csv") # already deleted + >>> connection.remove_file("/path/to/file.csv") + True + >>> connection.path_exists("/path/to/dir/file.csv") + False + >>> connection.remove_file("/path/to/file.csv") # already deleted, no error + False """ @abstractmethod @@ -280,13 +281,14 @@ def remove_dir(self, path: os.PathLike | str, recursive: bool = False) -> bool: Examples -------- - .. code:: python - - assert connection.remove_dir("/path/to/dir") - assert not connection.path_exists("/path/to/dir/file.csv") - assert not connection.path_exists("/path/to/dir") - - assert not connection.remove_dir("/path/to/dir") # already deleted + >>> connection.remove_dir("/path/to/dir") + True + >>> connection.path_exists("/path/to/dir") + False + >>> connection.path_exists("/path/to/dir/file.csv") + False + >>> connection.remove_dir("/path/to/dir") # already deleted, no error + False """ @abstractmethod @@ -332,11 +334,13 @@ def rename_file( Examples -------- - .. code:: python - - new_file = connection.rename_file("/path/to/file1.csv", "/path/to/file2.csv") - assert connection.path_exists("/path/to/file2.csv") - assert not connection.path_exists("/path/to/file1.csv") + >>> new_file = connection.rename_file("/path/to/file1.csv", "/path/to/file2.csv") + >>> os.fspath(new_file) + '/path/to/file2.csv' + >>> connection.path_exists("/path/to/file2.csv") + True + >>> connection.path_exists("/path/to/file1.csv") + False """ @abstractmethod @@ -376,11 +380,11 @@ def list_dir( Examples -------- - .. code:: python - - dir_content = connection.list_dir("/path/to/dir") - assert os.fspath(dir_content[0]) == "/path/to/dir/file.csv" - assert connection.path_exists("/path/to/dir/file.csv") + >>> dir_content = connection.list_dir("/path/to/dir") + >>> os.fspath(dir_content[0]) + '/path/to/dir/file.csv' + >>> connection.path_exists("/path/to/dir/file.csv") + True """ @abstractmethod @@ -428,13 +432,16 @@ def walk( Examples -------- - .. 
code:: python - - for root, dirs, files in connection.walk("/path/to/dir"): - assert os.fspath(root) == "/path/to/dir" - assert dirs == [] - assert os.fspath(files[0]) == "/path/to/dir/file.csv" - assert connection.path_exists("/path/to/dir/file.csv") + >>> for root, dirs, files in connection.walk("/path/to/dir"): + ... break + >>> os.fspath(root) + '/path/to/dir' + >>> dirs + [] + >>> os.fspath(files[0]) + '/path/to/dir/file.csv' + >>> connection.path_exists("/path/to/dir/file.csv") + True """ @abstractmethod @@ -483,14 +490,18 @@ def download_file( Examples -------- - .. code:: python - - local_file = connection.download_file( - remote_file_path="/path/to/source.csv", local_file_path="/path/to/target.csv" - ) - assert local_file.exists() - assert os.fspath(local_file) == "/path/to/target.csv" - assert local_file.stat().st_size == connection.get_stat("/path/to/source.csv").st_size + >>> local_file = connection.download_file( + ... remote_file_path="/path/to/source.csv", + ... local_file_path="/path/to/target.csv", + ... ) + >>> os.fspath(local_file) + '/path/to/target.csv' + >>> local_file.exists() + True + >>> local_file.stat().st_size # in bytes + 1024 + >>> connection.get_stat("/path/to/source.csv").st_size # same size + 1024 """ @abstractmethod @@ -539,14 +550,18 @@ def upload_file( Examples -------- - .. code:: python - - remote_file = connection.upload( - local_file_path="/path/to/source.csv", - remote_file_path="/path/to/target.csv", - ) - assert connection.path_exists("/path/to/target.csv") - assert remote_file.stat().st_size == os.stat("/path/to/source.csv").st_size + >>> remote_file = connection.upload( + ... local_file_path="/path/to/source.csv", + ... remote_file_path="/path/to/target.csv", + ... ) + >>> os.fspath(remote_file) + '/path/to/target.csv' + >>> connection.path_exists("/path/to/target.csv") + True + >>> remote_file.stat().st_size # in bytes + 1024 + >>> os.stat("/path/to/source.csv").st_size # same as source + 1024 """ @abstractmethod @@ -577,10 +592,8 @@ def read_text(self, path: os.PathLike | str, encoding: str = "utf-8") -> str: Examples -------- - .. code:: python - - content = connection.read_text("/path/to/dir/file.csv") - assert content == "some;header\n1;2" + >>> connection.read_text("/path/to/dir/file.csv") + 'some;header\n1;2' """ @abstractmethod @@ -608,10 +621,8 @@ def read_bytes(self, path: os.PathLike | str) -> bytes: Examples -------- - .. code:: python - - content = connection.read_bytes("/path/to/dir/file.csv") - assert content == b"0xdeadbeef" + >>> connection.read_bytes("/path/to/dir/file.csv") + b'0xdeadbeef' """ @abstractmethod @@ -654,10 +665,11 @@ def write_text( Examples -------- - .. code:: python - - file_path = connection.write_text("/path/to/dir/file.csv", "some;header\n1;2") - assert file_path.stat.st_size > 0 + >>> file_path = connection.write_text("/path/to/dir/file.csv", "some;header\n1;2") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_size # in bytes + 1024 """ @abstractmethod @@ -692,10 +704,11 @@ def write_bytes(self, path: os.PathLike | str, content: bytes) -> PathWithStatsP Examples -------- - .. 
code:: python - - file_path = connection.write_bytes("/path/to/dir/file.csv", b"0xdeadbeef") - assert file_path.stat.st_size > 0 + >>> file_path = connection.write_bytes("/path/to/dir/file.csv", b"0xdeadbeef") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_size # in bytes + 1024 """ @property diff --git a/onetl/base/base_file_filter.py b/onetl/base/base_file_filter.py index 641a5b3ed..5fdad2a9e 100644 --- a/onetl/base/base_file_filter.py +++ b/onetl/base/base_file_filter.py @@ -25,11 +25,12 @@ def match(self, path: PathProtocol) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - - assert filter.match(LocalPath("/path/to/file.csv")) - assert not filter.match(LocalPath("/path/to/excluded.csv")) - assert filter.match(LocalPath("/path/to/file.csv")) + from onetl.impl import LocalPath + + >>> filter.match(LocalPath("/path/to/file.csv")) + True + >>> filter.match(LocalPath("/path/to/excluded.csv")) + False + >>> filter.match(LocalPath("/path/to/file.csv")) + True """ diff --git a/onetl/base/base_file_limit.py b/onetl/base/base_file_limit.py index f71f1910c..4793f0874 100644 --- a/onetl/base/base_file_limit.py +++ b/onetl/base/base_file_limit.py @@ -33,12 +33,11 @@ def reset(self) -> Self: Examples -------- - .. code:: python - - assert limit.is_reached - - new_limit = limit.reset() - assert not new_limit.is_reached + >>> limit.is_reached + True + >>> new_limit = limit.reset() + >>> new_limit.is_reached + False """ @abstractmethod @@ -58,21 +57,18 @@ def stops_at(self, path: PathProtocol) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - - assert not limit.stops_at(LocalPath("/path/to/file.csv")) - # do this multiple times - ... - - # stopped on some input - assert limit.stops_at(LocalPath("/path/to/another.csv")) - - # at this point, .stops_at() and .is_reached will always return True, - # even on inputs that returned False before. - # it will be in the same state until .reset() is called - assert limit.stops_at(LocalPath("/path/to/file.csv")) + >>> from onetl.impl import LocalPath + >>> # limit is not reached yet + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + False + >>> # after limit is reached + >>> limit.stops_at(LocalPath("/path/to/another.csv")) + True + >>> # at this point, .stops_at() and .is_reached will always return True, + >>> # even on inputs that returned False before. + >>> # it will be in the same state until .reset() is called + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + True """ @property @@ -88,15 +84,16 @@ def is_reached(self) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - - assert not limit.is_reached - - assert not limit.stops_at(LocalPath("/path/to/file.csv")) - assert not limit.is_reached - - assert limit.stops_at(LocalPath("/path/to/file.csv")) - assert limit.is_reached + >>> from onetl.impl import LocalPath + >>> limit.is_reached + False + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + False + >>> limit.is_reached + False + >>> # after limit is reached + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + True + >>> limit.is_reached + True """ diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 23d1ced8b..1ab3cb14d 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -113,7 +113,6 @@ def close(self): .. 
code:: python df = connection.fetch("SELECT * FROM mytable LIMIT 10") - assert df.count() connection.close() # or diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 3d7437ad8..0a1584095 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -87,7 +87,6 @@ def close(self): .. code:: python content = connection.list_dir("/mydir") - assert content connection.close() # or @@ -603,19 +602,19 @@ def _extract_name_from_entry(self, entry) -> str: Get an entry name: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - assert entry == { - "created": "2023-12-08T18:33:39Z", - "owner": None, - "size": "23", - "modified": "2023-12-08 18:33:20", - "isdir": False, - "path": "/path/to/the/file.txt", - } - - assert entry._extract_name_from_entry(entry) == "file.txt" + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> entry._extract_name_from_entry(entry) + 'file.txt' """ @abstractmethod @@ -643,10 +642,19 @@ def _is_dir_entry(self, top: RemotePath, entry) -> bool: Show if the entry is a directory: - .. code:: python - - for component in connection._scan_entries(path="/a/path/to/the/directory"): - assert connection._is_dir_entry(root="/a/path/to/the/directory", entry) == True + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 0, + 'modified': '2023-12-08 18:33:20', + 'isdir': True, + 'path': '/path/to/the/directory', + } + >>> connection._is_dir_entry(root="/a/path/to/the/directory", entry) + True """ @abstractmethod @@ -674,10 +682,19 @@ def _is_file_entry(self, top: RemotePath, entry) -> bool: Show if the entry is a file: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - assert connection._is_file_entry(root="/a/path/to/the/directory", entry) == True + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> connection._is_file_entry(root="/a/path/to/the/directory", entry) + True """ @abstractmethod @@ -708,15 +725,25 @@ def _extract_stat_from_entry(self, top: RemotePath, entry) -> PathStatProtocol: Get statistics object from the entry: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - stat = connection._extract_stat_from_entry(root="/a/path/to/the/directory", entry) - - assert stat == RemotePathStat( - st_size=23, st_mtime=1670517693.0, st_mode=None, st_uid=None, st_gid=None - ) - + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... 
break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> connection._extract_stat_from_entry(root="/a/path/to/the/directory", entry) + RemotePathStat( + st_size=23, + st_mtime=1670517693.0, + st_mode=None, + st_uid=None, + st_gid=None, + ) """ def _log_parameters(self): diff --git a/onetl/connection/file_connection/mixins/rename_dir_mixin.py b/onetl/connection/file_connection/mixins/rename_dir_mixin.py index c183ebf56..2684b0351 100644 --- a/onetl/connection/file_connection/mixins/rename_dir_mixin.py +++ b/onetl/connection/file_connection/mixins/rename_dir_mixin.py @@ -52,11 +52,13 @@ def rename_dir( Examples -------- - .. code:: python - - new_file = connection.rename_dir("/path/to/dir1", "/path/to/dir2") - assert connection.path_exists("/path/to/dir1") - assert not connection.path_exists("/path/to/dir2") + >>> new_dir = connection.rename_dir("/path/to/dir1", "/path/to/dir2") + >>> os.fspath(new_dir) + '/path/to/dir2' + >>> connection.path_exists("/path/to/dir1") + False + >>> connection.path_exists("/path/to/dir2") + True """ log.debug("|%s| Renaming directory '%s' to '%s'", self.__class__.__name__, source_dir_path, target_dir_path) diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index f179491e0..f429838df 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -249,7 +249,7 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul Returns ------- - downloaded_files : :obj:`DownloadResult ` + :obj:`DownloadResult ` Download result object @@ -266,76 +266,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul Examples -------- - Download files from ``source_path`` to ``local_path`` + Download files from ``source_path`` to ``local_path``: - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", local_path="/local", ...) - downloaded_files = downloader.run() - - assert downloaded_files.successful == { + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", local_path="/local", ...) + >>> download_result = downloader.run() + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), LocalPath("/local/file2.txt"), - LocalPath("/local/nested/path/file3.txt"), # directory structure is preserved - } - assert downloaded_files.failed == {FailedRemoteFile("/remote/failed.file")} - assert downloaded_files.skipped == {RemoteFile("/remote/already.exists")} - assert downloaded_files.missing == {RemotePath("/remote/missing.file")} - - Download only certain files from ``source_path`` - - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", local_path="/local", ...) 
- - # paths could be relative or absolute, but all should be in "/remote" - downloaded_files = downloader.run( - [ - "/remote/file1.txt", - "/remote/nested/path/file3.txt", - # excluding "/remote/file2.txt" - ] - ) + # directory structure is preserved + LocalPath("/local/nested/path/file3.txt"), + ]), + failed=FileSet([ + FailedRemoteFile("/remote/failed.file"), + ]), + skipped=FileSet([ + RemoteFile("/remote/already.exists"), + ]), + missing=FileSet([ + RemotePath("/remote/missing.file"), + ]), + ) - assert downloaded_files.successful == { + Download only certain files from ``source_path``: + + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", local_path="/local", ...) + >>> # paths could be relative or absolute, but all should be in "/remote" + >>> download_result = downloader.run( + ... [ + ... "/remote/file1.txt", + ... "/remote/nested/path/file3.txt", + ... # excluding "/remote/file2.txt" + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), - LocalPath("/local/nested/path/file3.txt"), # directory structure is preserved - } - assert not downloaded_files.failed - assert not downloaded_files.skipped - assert not downloaded_files.missing - - Download certain files from any folder - - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(local_path="/local", ...) # no source_path set - - # only absolute paths - downloaded_files = downloader.run( - [ - "/remote/file1.txt", - "/any/nested/path/file2.txt", - ] - ) + # directory structure is preserved + LocalPath("/local/nested/path/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert downloaded_files.successful == { + Download certain files from any folder: + + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(local_path="/local", ...) # no source_path set + >>> # only absolute paths + >>> download_result = downloader.run( + ... [ + ... "/remote/file1.txt", + ... "/any/nested/path/file2.txt", + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), - LocalPath("/local/file2.txt"), # directory structure is NOT preserved without source_path - } - assert not downloaded_files.failed - assert not downloaded_files.skipped - assert not downloaded_files.missing + LocalPath("/local/file2.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -417,22 +417,16 @@ def view_files(self) -> FileSet[RemoteFile]: Examples -------- - View files - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", ...) - - view_files = downloader.view_files() + View files: - assert view_files == { - RemoteFile("/remote/file1.txt"), - RemoteFile("/remote/file3.txt"), - RemoteFile("/remote/nested/file3.txt"), - } + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", ...) 
+ >>> downloader.view_files() + FileSet([ + RemoteFile("/remote/file1.txt"), + RemoteFile("/remote/file3.txt"), + RemoteFile("/remote/nested/file3.txt"), + ]) """ if not self.source_path: diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 754dcdc04..0fa3090e1 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -25,34 +25,33 @@ class DownloadResult(FileResult): Examples -------- - Download files - - .. code:: python - - from onetl.impl import LocalPath, RemoteFile, FailedLocalFile - from onetl.file import FileDownloader, DownloadResult - - downloader = FileDownloader(local_path="/local", ...) - - downloaded_files = downloader.run( - [ - "/remote/file1", - "/remote/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert downloaded_files == DownloadResult( - successful={ - LocalPath("/local/file1"), - LocalPath("/local/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={RemoteFile("/existing/file")}, - missing={RemotePath("/missing/file")}, - ) + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(local_path="/local", ...) + >>> download_result = downloader.run( + ... [ + ... "/remote/file1", + ... "/remote/file2", + ... "/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ + LocalPath("/local/file1"), + LocalPath("/local/file2"), + ]), + failed=FileSet([ + FailedLocalFile("/failed/file") + ]), + skipped=FileSet([ + RemoteFile("/existing/file") + ]), + missing=FileSet([ + RemotePath("/missing/file") + ]), + ) """ successful: FileSet[LocalPath] = Field(default_factory=FileSet) diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index 1c0b55dc7..b55f048d8 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -177,7 +177,7 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: Returns ------- - moved_files : :obj:`MoveResult ` + :obj:`MoveResult ` Move result object @@ -194,76 +194,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: Examples -------- - Move files from ``source_path`` + Move files from ``source_path``: - .. code:: python - - from onetl.impl import RemoteFile, RemotePath - from onetl.file import FileMover - - mover = FileMover(source_path="/source", target_path="/target", ...) - moved_files = mover.run() - - assert moved_files.successful == { + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/source", target_path="/target", ...) + >>> move_result = mover.run() + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), RemoteFile("/target/file2.txt"), - RemoteFile("/target/nested/path/file3.txt"), # directory structure is preserved - } - assert moved_files.failed == {FailedRemoteFile("/source/failed.file")} - assert moved_files.skipped == {RemoteFile("/source/already.exists")} - assert moved_files.missing == {RemotePath("/source/missing.file")} - - Move only certain files from ``source_path`` - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(source_path="/source", target_path="/target", ...) 
- - # paths could be relative or absolute, but all should be in "/source" - moved_files = mover.run( - [ - "/source/file1.txt", - "/source/nested/path/file3.txt", - # excluding "/source/file2.txt" - ] - ) + # directory structure is preserved + RemoteFile("/target/nested/path/file3.txt"), + ]), + failed=FileSet([ + FailedRemoteFile("/source/failed.file"), + ]), + skipped=FileSet([ + RemoteFile("/source/already.exists"), + ]), + missing=FileSet([ + RemotePath("/source/missing.file"), + ]), + ) - assert moved_files.successful == { + Move only certain files from ``source_path``: + + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/source", target_path="/target", ...) + >>> # paths could be relative or absolute, but all should be in "/source" + >>> move_result = mover.run( + ... [ + ... "/source/file1.txt", + ... "/source/nested/path/file3.txt", + ... # excluding "/source/file2.txt" + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), - RemoteFile("/target/nested/path/file3.txt"), # directory structure is preserved - } - assert not moved_files.failed - assert not moved_files.skipped - assert not moved_files.missing - - Move certain files from any folder - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(target_path="/target", ...) # no source_path set - - # only absolute paths - moved_files = mover.run( - [ - "/remote/file1.txt", - "/any/nested/path/file3.txt", - ] - ) + # directory structure is preserved + RemoteFile("/target/nested/path/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert moved_files.successful == { + Move certain files from any folder: + + >>> from onetl.file import FileMover + >>> mover = FileMover(target_path="/target", ...) # no source_path set + >>> # only absolute paths + >>> move_result = mover.run( + ... [ + ... "/remote/file1.txt", + ... "/any/nested/path/file3.txt", + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), - RemoteFile("/target/file3.txt"), # directory structure is NOT preserved without source_path - } - assert not moved_files.failed - assert not moved_files.skipped - assert not moved_files.missing + RemoteFile("/target/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -327,22 +327,16 @@ def view_files(self) -> FileSet[RemoteFile]: Examples -------- - View files - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(source_path="/remote", ...) - - view_files = mover.view_files() + View files: - assert view_files == { - RemoteFile("/remote/file1.txt"), - RemoteFile("/remote/file3.txt"), - RemoteFile("/remote/nested/path/file3.txt"), - } + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/remote", ...) + >>> mover.view_files() + FileSet([ + RemoteFile("/remote/file1.txt"), + RemoteFile("/remote/file2.txt"), + RemoteFile("/remote/nested/path/file3.txt"), + ]) """ if not self.source_path: diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 90175a2d4..4c2456999 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -25,34 +25,33 @@ class MoveResult(FileResult): Examples -------- - Move files - - .. 
code:: python - - from onetl.impl import RemotePath, RemoteFile, FailedLocalFile - from onetl.file import FileMover, MoveResult - - mover = FileMover(local_path="/local", ...) - - moved_files = mover.run( - [ - "/source/file1", - "/source/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert moved_files == MoveResult( - successful={ - RemoteFile("/target/file1"), - RemoteFile("/target/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={RemoteFile("/existing/file")}, - missing={RemotePath("/missing/file")}, - ) + >>> from onetl.file import FileMover + >>> mover = FileMover(local_path="/local", ...) + >>> move_result = mover.run( + ... [ + ... "/source/file1", + ... "/source/file2", + ... "/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ + RemoteFile("/target/file1"), + RemoteFile("/target/file2"), + ]), + failed=FileSet([ + FailedLocalFile("/failed/file") + ]), + skipped=FileSet([ + RemoteFile("/existing/file") + ]), + missing=FileSet([ + RemotePath("/missing/file") + ]), + ) """ successful: FileSet[RemoteFile] = Field(default_factory=FileSet) diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index 2841741f9..d2b9aec6e 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -61,16 +61,13 @@ def successful_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.successful_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.successful_count + 2 """ return len(self.successful) @@ -83,16 +80,13 @@ def failed_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile - from onet.file.file_result import FileResult - - file_result = FileResult( - failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, - ) - - assert file_result.failed_count == 2 + >>> from onetl.impl import RemoteFile + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, + ... ) + >>> file_result.failed_count + 2 """ return len(self.failed) @@ -105,16 +99,13 @@ def skipped_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.skipped_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.skipped_count + 2 """ return len(self.skipped) @@ -127,16 +118,13 @@ def missing_count(self) -> int: Examples -------- - .. 
code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - missing={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.missing_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... missing={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.missing_count + 2 """ return len(self.missing) @@ -149,19 +137,16 @@ def total_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - assert file_result.total_count == 6 + >>> from onetl.impl import RemoteFile + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, + ... skipped={LocalPath("/skipped/file")}, + ... missing={LocalPath("/missing/file")}, + ... ) + >>> file_result.total_count + 6 """ return self.successful_count + self.failed_count + self.missing_count + self.skipped_count @@ -174,16 +159,13 @@ def successful_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.successful_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.successful_size # in bytes + 1024 """ return self.successful.total_size @@ -196,16 +178,16 @@ def failed_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile - from onet.file.file_result import FileResult - - file_result = FileResult( - failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, - ) - - assert file_result.failed_size == 1_000_000 # in bytes + >>> from onetl.impl import RemoteFile, RemotePathStat + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... failed={ + ... RemoteFile("/some/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/some/another.file"), stats=RemotePathStat(st_size=1024)), + ... }, + ... ) + >>> file_result.failed_size # in bytes + 2048 """ return self.failed.total_size @@ -218,16 +200,13 @@ def skipped_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.skipped_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... 
) + >>> file_result.skipped_size # in bytes + 1024 """ return self.skipped.total_size @@ -240,19 +219,19 @@ def total_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - assert file_result.total_size == 10_000_000 # in bytes + >>> from onetl.impl import RemoteFile, RemotePathStat, LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... failed={ + ... RemoteFile("/remote/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/remote/another.file", stats=RemotePathStat(st_size=1024)) + ... }, + ... skipped={LocalPath("/skipped/file")}, + ... missing={LocalPath("/missing/file")}, + ... ) + >>> file_result.total_size # in bytes + 4096 """ return self.successful_size + self.failed_size + self.skipped_size @@ -270,33 +249,31 @@ def raise_if_failed(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - files_with_exception = [ - FailedRemoteFile( - path="/remote/file1", - exception=NotAFileError("'/remote/file1' is not a file"), - ), - FailedRemoteFile( - path="/remote/file2", - exception=FileMissingError("'/remote/file2' does not exist"), - ), - ] - - file_result = FileResult(failed=files_with_exception) - - file_result.raise_if_failed() - # will raise FailedFilesError(''' - # Failed 2 files (10MB): - # '/remote/file1' (1 MB) - # NotAFileError("'/remote/file1' is not a file") - # - # '/remote/file2' (9 MB) - # FileMissingError("'/remote/file2' does not exist") - # ''') + >>> from onetl.impl import FailedRemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError, FileMissingError + >>> from onetl.file.file_result import FileResult + >>> files_with_exception = [ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... ] + >>> file_result = FileResult(failed=files_with_exception) + >>> file_result.raise_if_failed() + Traceback (most recent call last) + ... + onetl.exception.FailedFilesError: Failed 2 files (size='0 bytes'): + '/remote/file1' (size='0 bytes') + NotAFileError("'/remote/file1' is not a file") + + '/remote/file2' (size='0 Bytes') + PermissionError("'/remote/file2': [Errno 13] Permission denied") """ if self.failed: @@ -315,24 +292,20 @@ def raise_if_missing(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - missing={ - LocalPath("/missing/file1"), - LocalPath("/missing/file2"), - }, - ) - - file_result.raise_if_missing() - # will raise MissingFilesError(''' - # Missing 2 files: - # '/missing/file1' - # '/missing/file2' - # ''') + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... missing={ + ... 
LocalPath("/missing/file1"), + ... LocalPath("/missing/file2"), + ... }, + ... ) + >>> file_result.raise_if_missing() + Traceback (most recent call last): + ... + onetl.exception.MissingFilesError: Missing 2 files: + '/missing/file1' + '/missing/file2' """ if self.missing: @@ -351,21 +324,20 @@ def raise_if_skipped(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, - ) - - file_result.raise_if_skipped() - # will raise SkippedFilesError(''' - # Skipped 2 files (15 kB): - # '/skipped/file1' (10kB) - # '/skipped/file2' (5 kB) - # ''') + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... skipped={ + ... LocalPath("/skipped/file1"), + ... LocalPath("/skipped/file2"), + ... }, + ... ) + >>> file_result.raise_if_skipped() + Traceback (most recent call last): + ... + onetl.exception.SkippedFilesError: Skipped 2 files (15 kB): + '/skipped/file1' (10kB) + '/skipped/file2' (5 kB) """ if self.skipped: @@ -384,25 +356,22 @@ def raise_if_contains_zero_size(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={ - LocalPath("/local/empty1.file"), - LocalPath("/local/empty2.file"), - LocalPath("/local/normal.file"), - }, - ) - - file_result.raise_if_contains_zero_size() - # will raise ZeroFileSizeError(''' - # 2 files out of 3 have zero size: - # '/local/empty1.file' - # '/local/empty2.file' - # ''') + >>> from onetl.exception import ZeroFileSizeError + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={ + ... LocalPath("/local/empty1.file"), + ... LocalPath("/local/empty2.file"), + ... LocalPath("/local/normal.file"), + ... }, + ... ) + >>> file_result.raise_if_contains_zero_size() + Traceback (most recent call last): + ... + onetl.exception.ZeroFileSizeError: 2 files out of 3 have zero size: + '/local/empty1.file' + '/local/empty2.file' """ self.successful.raise_if_contains_zero_size() @@ -415,18 +384,16 @@ def is_empty(self) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult() - assert file_result1.is_empty - - file_result2 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - ) - assert not file_result2.is_empty + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult() + >>> file_result1.is_empty + True + >>> file_result2 = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... ) + >>> file_result2.is_empty + False """ return not self.failed and not self.successful and not self.skipped @@ -444,15 +411,12 @@ def raise_if_empty(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult() - - file_result.raise_if_empty() - # will raise EmptyFilesError("There are no files in the result") + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult() + >>> file_result.raise_if_empty() + Traceback (most recent call last): + ... 
+ onetl.exception.EmptyFilesError: There are no files in the result """ if self.is_empty: @@ -460,71 +424,66 @@ def raise_if_empty(self) -> None: @property def details(self) -> str: - ''' + """ Return detailed information about files in the result object Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={ - FailedRemoteFile( - path="/remote/file1", - exception=NotAFileError("'/remote/file1' is not a file"), - ), - FailedRemoteFile( - path="/remote/file2", - exception=FileMissingError("'/remote/file2' does not exist"), - ), - }, - skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, - missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, - ) - - details1 = """ - Total: 8 files (10.4 MB) - - Successful 2 files (30.7 kB): - '/successful1' (10.2 kB) - '/successful2' (20.5 kB) - - Failed 2 files (10MB): - '/remote/file1' (1 MB) - NotAFileError("'/remote/file1' is not a file") - - '/remote/file2' (9 MB) - FileMissingError("'/remote/file2' does not exist") - - Skipped 2 files (15 kB): - '/skipped/file1' (10kB) - '/skipped/file2' (5 kB) - - Missing 2 files: - '/missing/file1' - '/missing/file2' - """ - - assert file_result1.details == details1 - - file_result2 = FileResult() - details2 = """ - No successful files - - No failed files - - No skipped files - - No missing files - """ - - assert file_result2.details == details2 - ''' + >>> from onetl.impl import FailedRemoteFile, LocalPath, RemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult( + ... successful={ + ... RemoteFile("/local/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/local/another.file", stats=RemotePathStat(st_size=1024)), + ... }, + ... failed={ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... }, + ... skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, + ... missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, + ... ) + >>> print(file_result1.details) + Total: 8 files (size='2.0 kB') + + Successful 2 files (size='2.0 kB'): + '/local/another.file' (size='1.0 kB') + '/local/file' (size='1.0 kB') + + Failed 2 files (size='0 Bytes'): + '/remote/file2' (size='0 Bytes') + PermissionError("'/remote/file2': [Errno 13] Permission denied") + '/remote/file1' (size='0 Bytes') + NotAFileError("'/remote/file1' is not a file") + + Skipped 2 files (size='0 Bytes'): + '/skipped/file1' + '/skipped/file2' + + Missing 2 files: + '/missing/file2' + '/missing/file1' + + >>> file_result2 = FileResult() + >>> print(file_result2.details) + No successful files + + No failed files + + No skipped files + + No missing files + """ result = [] @@ -540,41 +499,50 @@ def details(self) -> str: @property def summary(self) -> str: - ''' + """ Return short summary about files in the result object Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - result = """ - Total: 8 files (10.4 MB) - - Successful: 2 files (30.7 kB) - - Failed: 2 files (10MB) - - Skipped: 2 files (15 kB) - - Missing: 2 files - """ - - assert file_result1.summary == result - - file_result2 = FileResult() - assert file_result1.summary == "No files" - ''' + >>> from onetl.impl import FailedRemoteFile, LocalPath, RemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult( + ... successful={ + ... RemoteFile("/local/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/local/another.file", stats=RemotePathStat(st_size=1024)), + ... }, + ... failed={ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... }, + ... skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, + ... missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, + ... ) + >>> print(file_result1.summary) + Total: 8 files (size='2.0 kB') + + Successful: 2 files (size='2.0 kB') + + Failed: 2 files (size='0 Bytes') + + Skipped: 2 files (size='0 Bytes') + + Missing: 2 files + + >>> file_result2 = FileResult() + >>> print(file_result2.summary) + No files + """ return self._total_message def __str__(self): diff --git a/onetl/file/file_set.py b/onetl/file/file_set.py index 4cbf86ea5..2447ad04f 100644 --- a/onetl/file/file_set.py +++ b/onetl/file/file_set.py @@ -33,14 +33,11 @@ def total_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - file_set = FileSet({LocalPath("/some/file"), LocalPath("/some/another.file")}) - - assert path_set.total_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onet.file.file_set import FileSet + >>> file_set = FileSet({LocalPath("/some/file"), LocalPath("/some/another.file")}) + >>> path_set.total_size # in bytes + 1024 """ return sum( @@ -60,14 +57,12 @@ def raise_if_empty(self) -> None: Examples -------- - .. code:: python - - from onet.file.file_set import FileSet - - file_set = FileSet() - - file_set.raise_if_empty() - # will raise EmptyFilesError("There are no files in the set") + >>> from onet.file.file_set import FileSet + >>> file_set = FileSet() + >>> file_set.raise_if_empty() + Traceback (most recent call last): + ... + onetl.exception.EmptyFilesError: There are no files in the set """ if not self: @@ -86,23 +81,19 @@ def raise_if_contains_zero_size(self) -> None: Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_set import FileSet - - file_set = FileSet( - LocalPath("/local/empty1.file"), - LocalPath("/local/empty2.file"), - LocalPath("/local/normal.file"), - ) - - file_set.raise_if_contains_zero_size() - # will raise ZeroFileSizeError(''' - # 2 files out of 3 have zero size: - # '/local/empty1.file' - # '/local/empty2.file' - # ''') + >>> from onetl.impl import RemoteFile, LocalPath + >>> from onet.file.file_set import FileSet + >>> file_set = FileSet( + ... LocalPath("/local/empty1.file"), + ... LocalPath("/local/empty2.file"), + ... LocalPath("/local/normal.file"), + ... ) + >>> file_set.raise_if_contains_zero_size() + Traceback (most recent call last): + ... + onetl.exception.ZeroFileSizeError: 2 files out of 3 have zero size: + '/local/empty1.file' + '/local/empty2.file' """ lines = [] @@ -132,21 +123,19 @@ def summary(self) -> str: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - path_set1 = FileSet( - [ - LocalPath("/local/file"), - LocalPath("/local/another.file"), - ] - ) - - assert path_set1.summary == "2 files (30.7 kB)" - - assert FileSet().summary == "No files" + >>> from onetl.impl import LocalPath + >>> from onet.file.file_set import FileSet + >>> path_set1 = FileSet( + ... [ + ... LocalPath("/local/file"), + ... LocalPath("/local/another.file"), + ... ] + ... ) + >>> print(path_set1.summary) + 2 files (30.7 kB) + >>> path_set2 = FileSet() + >>> print(path_set2.summary) + No files """ if not self: @@ -156,35 +145,30 @@ def summary(self) -> str: return f"{file_number_str} (size='{naturalsize(self.total_size)}')" @property - def details(self) -> str: - ''' + def details(self) -> str: # noqa: WPS473 + """ Return detailed information about files in the set Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - path_set1 = FileSet( - [ - LocalPath("/local/file"), - LocalPath("/local/another.file"), - ] - ) - - details1 = """ - 2 files (30.7 kB): - '/local/file' (10.2 kB) - '/local/another.file' (20.5 kB) - """ - - assert path_set1.details == details1 - - assert FileSet().details == "No files" - ''' + >>> from onetl.impl import LocalPath + >>> from onet.file.file_set import FileSet + >>> path_set1 = FileSet( + ... [ + ... LocalPath("/local/file"), + ... LocalPath("/local/another.file"), + ... ] + ... ) + >>> print(path_set1.details) + 2 files (30.7 kB): + '/local/file' (10.2 kB) + '/local/another.file' (20.5 kB) + + >>> path_set2 = FileSet() + >>> print(path_set2.details) + No files + """ if not self: return self.summary diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index ba107310b..819d357f5 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -161,7 +161,7 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: Returns ------- - uploaded_files : :obj:`UploadResult ` + :obj:`UploadResult ` Upload result object @@ -182,85 +182,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: Examples -------- - Upload files from ``local_path`` to ``target_path`` + Upload files from ``local_path`` to ``target_path``: - .. code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", target_path="/remote", ...) 
- uploaded_files = uploader.run() - - assert uploaded_files.successful == { + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", target_path="/remote", ...) + >>> upload_result = uploader.run() + >>> upload_result + UploadResult( + successful=FileSet([ RemoteFile("/remote/file1"), RemoteFile("/remote/file2"), - RemoteFile("/remote/nested/path/file3"), # directory structure is preserved - } - assert uploaded_files.failed == {FailedLocalFile("/local/failed.file")} - assert uploaded_files.skipped == {LocalPath("/local/already.exists")} - assert uploaded_files.missing == {LocalPath("/local/missing.file")} - - Upload only certain files from ``local_path`` - - .. code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", target_path="/remote", ...) - - # paths could be relative or absolute, but all should be in "/local" - uploaded_files = uploader.run( - [ - "/local/file1", - "/local/nested/path/file3", - # excluding "/local/file2", - ] - ) + # directory structure is preserved + RemoteFile("/remote/nested/path/file3") + ]), + failed=FileSet([ + FailedLocalFile("/local/failed.file"), + ]), + skipped=FileSet([ + LocalPath("/local/already.exists"), + ]), + missing=FileSet([ + LocalPath("/local/missing.file"), + ]), + ) - assert uploaded_files.successful == { + Upload only certain files from ``local_path``: + + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", target_path="/remote", ...) + >>> # paths could be relative or absolute, but all should be in "/local" + >>> upload_result = uploader.run( + ... [ + ... "/local/file1", + ... "/local/nested/path/file3", + ... # excluding "/local/file2", + ... ] + ... ) + >>> upload_result + UploadResult( + successful=FileSet([ RemoteFile("/remote/file1"), - RemoteFile("/remote/nested/path/file3"), # directory structure is preserved - } - assert not uploaded_files.failed - assert not uploaded_files.skipped - assert not uploaded_files.missing - - Upload only certain files from any folder - - .. code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(target_path="/remote", ...) # no local_path set - - # only absolute paths - uploaded_files = uploader.run( - [ - "/local/file1.txt", - "/any/nested/path/file3.txt", - ] - ) + # directory structure is preserved + RemoteFile("/remote/nested/path/file3"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert uploaded_files.successful == { - RemoteFile("/remote/file1"), - RemoteFile("/remote/file3"), + Upload only certain files from any folder: + + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(target_path="/remote", ...) # no local_path set + >>> # only absolute paths + >>> upload_result = uploader.run( + ... [ + ... "/local/file1.txt", + ... "/any/nested/path/file3.txt", + ... ] + ... 
) + >>> upload_result + UploadResult( + successful=FileSet([ + RemoteFile("/remote/file1.txt"), # directory structure is NOT preserved without local_path - } - assert not uploaded_files.failed - assert not uploaded_files.skipped - assert not uploaded_files.missing + RemoteFile("/remote/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -332,22 +323,16 @@ def view_files(self) -> FileSet[LocalPath]: Examples -------- - View files - - .. code:: python - - from onetl.impl import LocalPath - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", ...) - - view_files = uploader.view_files() + View files: - assert view_files == { - LocalPath("/local/file1.txt"), - LocalPath("/local/file3.txt"), - LocalPath("/local/nested/path/file3.txt"), - } + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", ...) + >>> uploader.view_files() + FileSet([ + LocalPath("/local/file1.txt"), + LocalPath("/local/file3.txt"), + LocalPath("/local/nested/path/file3.txt"), + ]) """ if not self.local_path: diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 9a785c719..e3b068190 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -25,34 +25,33 @@ class UploadResult(FileResult): Examples -------- - Upload files - - .. code:: python - - from onetl.impl import LocalPath, RemoteFile, FailedLocalFile - from onetl.file import FileUploader, UploadResult - - uploader = FileUploader(target_path="/remote", ...) - - uploaded_files = uploader.run( - [ - "/local/file1", - "/local/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert uploaded_files == UploadResult( - successful={ - RemoteFile("/remote/file1"), - RemoteFile("/remote/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={LocalPath("/existing/file")}, - missing={LocalPath("/missing/file")}, - ) + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(target_path="/remote", ...) + >>> upload_result = uploader.run( + ... [ + ... "/local/file1", + ... "/local/file2", + ... "/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... ) + >>> upload_result + UploadResult( + successful=FileSet([ + RemoteFile("/remote/file1"), + RemoteFile("/remote/file2"), + ]), + failed=FileSet([ + FailedLocalFile("/failed/file") + ]), + skipped=FileSet([ + LocalPath("/existing/file") + ]), + missing=FileSet([ + LocalPath("/missing/file") + ]), + ) """ successful: FileSet[RemoteFile] = Field(default_factory=FileSet) diff --git a/onetl/file/filter/match_all_filters.py b/onetl/file/filter/match_all_filters.py index e5c6a7d56..6695bbf2a 100644 --- a/onetl/file/filter/match_all_filters.py +++ b/onetl/file/filter/match_all_filters.py @@ -31,16 +31,15 @@ def match_all_filters(path: PathProtocol, filters: Iterable[BaseFileFilter]) -> Examples -------- - .. 
code:: python - - from onetl.file.filter import Glob, ExcludeDir, match_all_filters - from onetl.impl import LocalPath - - filters = [Glob("*.csv"), ExcludeDir("/excluded")] - - assert match_all_filters(LocalPath("/path/to/file.csv"), filters) - assert not match_all_filters(LocalPath("/path/to/file.txt"), filters) - assert not match_all_filters(LocalPath("/excluded/path/file.csv"), filters) + >>> from onetl.file.filter import Glob, ExcludeDir, match_all_filters + >>> from onetl.impl import RemoteFile, RemotePathStat + >>> filters = [Glob("*.csv"), ExcludeDir("/excluded")] + >>> match_all_filters(RemoteFile("/path/to/file.csv", stats=RemotePathStat()), filters) + True + >>> match_all_filters(RemoteFile("/path/to/file.txt", stats=RemotePathStat()), filters) + False + >>> match_all_filters(RemoteFile("/excluded/path/file.csv", stats=RemotePathStat()), filters) + False """ empty = True diff --git a/onetl/file/limit/limits_reached.py b/onetl/file/limit/limits_reached.py index fe3d98b85..2f3833d63 100644 --- a/onetl/file/limit/limits_reached.py +++ b/onetl/file/limit/limits_reached.py @@ -28,18 +28,17 @@ def limits_reached(limits: Iterable[BaseFileLimit]) -> bool: Examples -------- - .. code:: python - - from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at - from onetl.impl import LocalPath - - limits = [MaxFilesCount(2)] - assert not limits_reached(limits) - - assert not limits_stop_at(LocalPath("/path/to/file.csv"), limits) - assert limits_stop_at(LocalPath("/path/to/file.csv"), limits) - - assert limits_reached(limits) + >>> from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(2)] + >>> limits_reached(limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True + >>> limits_reached(limits) + True """ debug = log.isEnabledFor(logging.DEBUG) diff --git a/onetl/file/limit/limits_stop_at.py b/onetl/file/limit/limits_stop_at.py index 472c94426..2ff1f7945 100644 --- a/onetl/file/limit/limits_stop_at.py +++ b/onetl/file/limit/limits_stop_at.py @@ -31,15 +31,13 @@ def limits_stop_at(path: PathProtocol, limits: Iterable[BaseFileLimit]) -> bool: Examples -------- - .. code:: python - - from onetl.file.limit import MaxFilesCount, limits_stop_at - from onetl.impl import LocalPath - - limits = [MaxFilesCount(1)] - - assert not limits_stop_at(LocalPath("/path/to/file.csv"), limits) - assert limits_stop_at(LocalPath("/path/to/file.csv"), limits) + >>> from onetl.file.limit import MaxFilesCount, limits_stop_at + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(2)] + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True """ reached = [] for limit in limits: diff --git a/onetl/file/limit/reset_limits.py b/onetl/file/limit/reset_limits.py index 4a30ca445..de9201da8 100644 --- a/onetl/file/limit/reset_limits.py +++ b/onetl/file/limit/reset_limits.py @@ -29,18 +29,18 @@ def reset_limits(limits: Iterable[BaseFileLimit]) -> list[BaseFileLimit]: Examples -------- - .. 
code:: python - - from onetl.file.limit import MaxFilesCount, limits_reached, reset_limits - from onetl.impl import LocalPath - - limits = [MaxFilesCount(1)] - - assert not limits_reached(limits) - # do something - assert limits_reached(limits) - - new_limits = reset_limits(limits) - assert not limits_reached(new_limits) + >>> from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at, reset_limits + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(1)] + >>> limits_reached(limits) + False + >>> # do something + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True + >>> limits_reached(limits) + True + >>> new_limits = reset_limits(limits) + >>> limits_reached(new_limits) + False """ return [limit.reset() for limit in limits] diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index c0e9a9f25..f5163be65 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -84,13 +84,13 @@ def enable(self): Examples -------- - .. code:: python - - hook = Hook(..., enabled=False) - assert not hook.enabled - - hook.enable() - assert hook.enabled + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=False) + >>> hook.enabled + False + >>> hook.enable() + >>> hook.enabled + True """ if self.enabled: logger.log( @@ -110,13 +110,13 @@ def disable(self): Examples -------- - .. code:: python - - hook = Hook(..., enabled=True) - assert hook.enabled - - hook.disable() - assert not hook.enabled + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> hook.disable() + >>> hook.enabled + False """ if self.enabled: logger.log(NOTICE, "|Hooks| Disable hook '%s.%s'", self.callback.__module__, self.callback.__qualname__) @@ -146,30 +146,33 @@ def skip(self): .. tabs:: - .. code-tab:: py Context manager syntax - - hook = Hook(..., enabled=True) - assert hook.enabled - - with hook.skip(): - assert not hook.enabled - - # hook state is restored as it was before entering the context manager - assert hook.enabled - - .. code-tab:: py Decorator syntax - - hook = Hook(..., enabled=True) - assert hook.enabled - - @hook.skip() - def hook_disabled(): - assert not hook.enabled - - hook_disabled() - - # hook state is restored as it was before entering the context manager - assert hook.enabled + .. tab:: Context manager syntax + + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> with hook.skip(): + ... print(hook.enabled) + False + >>> # hook state is restored as it was before entering the context manager + >>> hook.enabled + True + + .. tab:: Decorator syntax + + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> @hook.skip() + ... def hook_disabled(): + ... print(hook.enabled) + >>> hook_disabled() + False + >>> # hook state is restored as it was before entering the context manager + >>> hook.enabled + True """ if not self.enabled: logger.log( @@ -205,18 +208,17 @@ def __call__(self, *args, **kwargs) -> T | ContextDecorator: Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook, HookPriority - - - def some_func(*args, **kwargs): ... - - - hook = Hook(callback=some_func) - - result = hook(1, "abc", some="arg") - assert result == some_func(1, "abc", some="arg") + >>> from onetl.hooks.hook import Hook, HookPriority + >>> def some_func(*args, **kwargs): + ... print(args) + ... print(kwargs) + ... 
return "func result" + >>> hook = Hook(callback=some_func) + >>> result = hook(1, "abc", some="arg") + (1, 'abc') + {'some': 'arg'} + >>> result + 'func result' """ result = self.callback(*args, **kwargs) if isinstance(result, Generator): diff --git a/onetl/hooks/hook_collection.py b/onetl/hooks/hook_collection.py index 6c6643122..f5b913966 100644 --- a/onetl/hooks/hook_collection.py +++ b/onetl/hooks/hook_collection.py @@ -34,26 +34,25 @@ def __init__(self, hooks: list[Hook] | HookCollection | None = None): @property def active(self): """ - Return HookCollection but containing only hooks with enabled state. + Return new HookCollection but containing only hooks with ``enabled=True`` state. If called after :obj:`~stop` or inside :obj:`~skip`, empty collection will be returned. Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) - - assert hooks.active == HookCollection([Hook(callback=func1)]) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1, enabled=True), + ... Hook(callback=func2, enabled=False), + ... ], + ... ) + >>> len(hooks.active) + 1 """ if self._enabled: @@ -68,20 +67,19 @@ def stop(self) -> None: Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - hooks.stop() - assert hooks.active == HookCollection() + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> hooks.stop() + >>> hooks.active + HookCollection([]) """ self._enabled = False @@ -97,26 +95,24 @@ def resume(self) -> None: Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - hooks.resume() - - assert hooks.active == HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> hooks.resume() + >>> hooks.active # doctest: +SKIP + HookCollection( + [ + Hook(callback=func1), + Hook(callback=func2), + ] + ) """ self._enabled = True @@ -135,25 +131,26 @@ def skip(self): Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - # hooks state is same as created by constructor - - with hooks.skip(): - # all hooks are disabled here - ... - - # hooks state is restored as it was before entering the context manager + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... 
Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> # hooks state is same as created by constructor + >>> len(hooks.active) + 2 + >>> with hooks.skip(): + ... # all hooks are disabled here + ... print(len(hooks.active)) + 0 + >>> # hooks state is restored as it was before entering the context manager + >>> len(hooks.active) + 2 """ if not self._enabled: @@ -171,26 +168,20 @@ def add(self, item: Hook): Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - ] - ) - - new_hook = Hook(callback=func2, enabled=False) - hooks.add(new_hook) - - assert hooks == HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... ], + ... ) + + >>> new_hook = Hook(callback=func2) + >>> hooks.add(new_hook) + >>> len(hooks.active) + 2 """ self._hooks.append(item) @@ -200,26 +191,20 @@ def extend(self, hooks: Iterable[Hook]): Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - ] - ) - - new_hooks = [Hook(callback=func2, enabled=False)] - hooks.extend(new_hook) - - assert hooks == HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... ], + ... ) + + >>> new_hooks = [Hook(callback=func2)] + >>> hooks.extend(new_hooks) + >>> len(hooks.active) + 2 """ self._hooks.extend(hooks) @@ -229,20 +214,20 @@ def __iter__(self): Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=True), - ] - ) - - for hook in hooks: - assert hook.enabled + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1, enabled=True), + ... Hook(callback=func2, enabled=False), + ... ], + ... ) + >>> for hook in hooks: + ... print(hook.enabled) + True + False """ return iter(self._hooks) @@ -252,18 +237,20 @@ def __len__(self): Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=True), - ] - ) - - assert len(hooks) == 2 + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... 
) + >>> len(hooks) + 2 """ return len(self._hooks) + + def __repr__(self): + return f"HookCollection({self._hooks})" diff --git a/onetl/hooks/method_inheritance_stack.py b/onetl/hooks/method_inheritance_stack.py index c65a2924e..99fef076c 100644 --- a/onetl/hooks/method_inheritance_stack.py +++ b/onetl/hooks/method_inheritance_stack.py @@ -20,30 +20,28 @@ class MethodInheritanceStack: Examples -------- - .. code:: python - - @support_hooks - class BaseClass: - @slot - def some_method(self, *args, **kwargs): - pass - - - @support_hooks - class MyClass(BaseClass): - @slot - def some_method(self, *args, **kwargs): - self.do_something() - super().some_method(*args, **kwargs) - - - # caused by MyClass.some_method() call - with MethodInheritanceStack(MyClass, "some_method") as method_call_stack: - assert method_call_stack.level == 0 - - # MyClass.some_method() called super().some_method() - with MethodInheritanceStack(BaseClass, "some_method") as method_call_stack: - assert method_call_stack.level == 1 + >>> from onetl.hooks import support_hooks, slot + >>> from onetl.hooks.method_inheritance_stack import MethodInheritanceStack + >>> @support_hooks + ... class BaseClass: + ... @slot + ... def some_method(self, *args, **kwargs): + ... pass + >>> @support_hooks + ... class MyClass(BaseClass): + ... @slot + ... def some_method(self, *args, **kwargs): + ... self.do_something() + ... super().some_method(*args, **kwargs) + + >>> # caused by MyClass.some_method() call + >>> with MethodInheritanceStack(MyClass, "some_method") as method_call_stack: + ... print("MyClass", method_call_stack.level) + ... # MyClass.some_method() called super().some_method() + ... with MethodInheritanceStack(BaseClass, "some_method") as method_call_stack: + ... print("BaseClass", method_call_stack.level) + MyClass 0 + BaseClass 1 """ _stack: dict[type, dict[str, int]] = defaultdict(lambda: defaultdict(int)) diff --git a/onetl/hwm/store/hwm_class_registry.py b/onetl/hwm/store/hwm_class_registry.py index ccd074e40..82b0eef2e 100644 --- a/onetl/hwm/store/hwm_class_registry.py +++ b/onetl/hwm/store/hwm_class_registry.py @@ -13,18 +13,16 @@ class SparkTypeToHWM: Examples -------- - .. code:: python - - from etl_entities.hwm import ColumnIntHWM, ColumnDateHWM - from onetl.hwm.store import SparkTypeToHWM - - assert SparkTypeToHWM.get("integer") == ColumnIntHWM - assert SparkTypeToHWM.get("short") == ColumnIntHWM # multiple type names are supported - - assert SparkTypeToHWM.get("date") == ColumnDateHWM - - assert SparkTypeToHWM.get("unknown") is None - + >>> from etl_entities.hwm import ColumnIntHWM, ColumnDateHWM + >>> from onetl.hwm.store import SparkTypeToHWM + >>> SparkTypeToHWM.get("integer") + + >>> # multiple type names are supported + >>> SparkTypeToHWM.get("short") + + >>> SparkTypeToHWM.get("date") + + >>> SparkTypeToHWM.get("unknown") """ _mapping: ClassVar[dict[str, type[HWM]]] = { @@ -57,20 +55,15 @@ def register_spark_type_to_hwm_type_mapping(*type_names: str): Examples -------- - .. code:: python - - from etl_entities import HWM - from onetl.hwm.store import SparkTypeToHWM - from onetl.hwm.store import SparkTypeToHWM, register_spark_type_to_hwm_type_mapping - - - @register_spark_type_to_hwm_type_mapping("somename", "anothername") - class MyHWM(HWM): ... 
- - - assert SparkTypeToHWM.get("somename") == MyClass - assert SparkTypeToHWM.get("anothername") == MyClass - + >>> from etl_entities.hwm import ColumnHWM + >>> from onetl.hwm.store import SparkTypeToHWM + >>> from onetl.hwm.store import SparkTypeToHWM, register_spark_type_to_hwm_type_mapping + >>> @register_spark_type_to_hwm_type_mapping("somename", "anothername") + ... class MyHWM(ColumnHWM): ... + >>> SparkTypeToHWM.get("somename") + + >>> SparkTypeToHWM.get("anothername") + """ def wrapper(cls: type[HWM]): diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 267b01613..d6999e96e 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -75,21 +75,26 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): .. code:: python - assert download_result == DownloadResult( - successful=[ - "/path/my/file1", - "/path/my/file2", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file1"), + LocalFile("/downloaded/file2"), + }, ) Then the downloaded files list is saved as ``FileListHWM`` object into :ref:`HWM Store `: .. code:: python - [ - "/path/my/file1", - "/path/my/file2", - ] + FileListHWM( + ..., + entity="/path", + value=[ + "/path/my/file1", + "/path/my/file2", + ], + ) Next incremental run will download only new files from the source: @@ -104,22 +109,26 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): .. code:: python # only files which are not in FileListHWM - - assert download_result == DownloadResult( - successful=[ - "/path/my/file3", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file3"), + }, ) New files will be added to the ``FileListHWM`` and saved to :ref:`HWM Store `: .. code:: python - [ - "/path/my/file1", - "/path/my/file2", - "/path/my/file3", - ] + FileListHWM( + ..., + entity="/path", + value=[ + "/path/my/file1", + "/path/my/file2", + "/path/my/file3", + ], + ) .. warning:: diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 5368b2039..cf5a0d154 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -38,11 +38,12 @@ class SnapshotStrategy(BaseStrategy): .. code:: python - assert download_result == DownloadResult( - successful=[ - "/path/my/file1", - "/path/my/file2", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file1"), + LocalFile("/downloaded/file2"), + }, ) Examples diff --git a/setup.cfg b/setup.cfg index d1f770d78..b72ff1b1d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -273,7 +273,9 @@ ignore = # E704 multiple statements on one line: def func(): ... 
E704, # WPS474 Found import object collision - WPS474 + WPS474, +# WPS318 Found extra indentation + WPS318 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index de2f9e522..b72f8ac1b 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -213,7 +213,7 @@ def table_finalizer(): SELECT * FROM {table_name} WHERE id_int >= 50 RETURNING id_int{suffix} - """, + """, ) df = postgres.fetch(f"SELECT * FROM {temp_table}{suffix}") @@ -302,7 +302,7 @@ def test_postgres_connection_execute_procedure( AS $$ SELECT COUNT(*) FROM {table}; $${suffix} - """, + """, ) def proc_finalizer(): @@ -358,7 +358,7 @@ def proc_finalizer(): AS $$ SELECT COUNT(*) FROM {table}; $${suffix} - """, + """, ) with pytest.raises(Exception): @@ -412,7 +412,7 @@ def test_postgres_connection_execute_procedure_arguments( SELECT COUNT(*) FROM {table} WHERE id_int = idd; $${suffix} - """, + """, ) def proc_finalizer(): @@ -471,7 +471,7 @@ def test_postgres_connection_execute_procedure_inout( WHERE id_int < idd; END $${suffix} - """, + """, ) def proc_finalizer(): @@ -518,7 +518,7 @@ def test_postgres_connection_execute_procedure_ddl( AS $$ CREATE TABLE {table} (iid INT, text VARCHAR(400)); $${suffix} - """, + """, ) def proc_finalizer(): @@ -565,7 +565,7 @@ def table_finalizer(): AS $$ INSERT INTO {table} VALUES(idd, text); $${suffix} - """, + """, ) def proc_finalizer(): @@ -605,7 +605,7 @@ def test_postgres_connection_execute_function( RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -667,7 +667,7 @@ def function_finalizer(): RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) # replace @@ -681,7 +681,7 @@ def function_finalizer(): RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) # missing @@ -706,7 +706,7 @@ def function_finalizer(): RETURN 100 END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) @@ -746,7 +746,7 @@ def test_postgres_connection_execute_function_arguments( RETURN i*100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -825,7 +825,7 @@ def test_postgres_connection_execute_function_table( FROM {table} WHERE id_int < i; $$ LANGUAGE SQL{suffix} - """, + """, ) def function_finalizer(): @@ -873,7 +873,7 @@ def test_postgres_connection_execute_function_ddl( RETURN 1; END; $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -941,7 +941,7 @@ def table_finalizer(): RETURN idd; END; $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): From e9788fdc9373ee18d98114227f767b6ea059b34a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 22 May 2024 09:18:45 +0000 Subject: [PATCH 55/71] [DOP-14058] Improve Kafka documentation --- .../next_release/276.improvement.rst | 4 + docs/conf.py | 1 + .../db_connection/kafka/format_handling.rst | 297 ------------------ docs/connection/db_connection/kafka/index.rst | 8 +- .../db_connection/kafka/prerequisites.rst | 70 +++++ docs/connection/db_connection/kafka/read.rst | 176 +++++++---- 
.../db_connection/kafka/troubleshooting.rst | 13 + docs/connection/db_connection/kafka/write.rst | 114 ++++--- .../db_connection/kafka/connection.py | 10 +- .../kafka/kafka_kerberos_auth.py | 2 +- .../db_connection/kafka/kafka_scram_auth.py | 4 +- .../db_connection/kafka/kafka_ssl_protocol.py | 8 +- onetl/file/format/avro.py | 132 +++++--- onetl/file/format/csv.py | 97 ++++-- onetl/file/format/json.py | 106 +++++-- onetl/file/format/xml.py | 84 ++--- 16 files changed, 567 insertions(+), 559 deletions(-) create mode 100644 docs/changelog/next_release/276.improvement.rst delete mode 100644 docs/connection/db_connection/kafka/format_handling.rst create mode 100644 docs/connection/db_connection/kafka/prerequisites.rst create mode 100644 docs/connection/db_connection/kafka/troubleshooting.rst diff --git a/docs/changelog/next_release/276.improvement.rst b/docs/changelog/next_release/276.improvement.rst new file mode 100644 index 000000000..4e387bd5a --- /dev/null +++ b/docs/changelog/next_release/276.improvement.rst @@ -0,0 +1,4 @@ +Improve Kafka documentation: + * Add "Prerequisites" page describing different aspects of connecting to Kafka + * Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes. + * Add "Troubleshooting" page diff --git a/docs/conf.py b/docs/conf.py index d6b1839e2..9427e1902 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,6 +58,7 @@ "sphinxcontrib.plantuml", "sphinx.ext.extlinks", "sphinx_favicon", + "sphinxcontrib.autodoc_pydantic", ] numpydoc_show_class_members = False autodoc_pydantic_model_show_config = False diff --git a/docs/connection/db_connection/kafka/format_handling.rst b/docs/connection/db_connection/kafka/format_handling.rst deleted file mode 100644 index 5f2d00864..000000000 --- a/docs/connection/db_connection/kafka/format_handling.rst +++ /dev/null @@ -1,297 +0,0 @@ -.. _kafka-data-format-handling: - -Data Format Handling --------------------- - -Kafka topics can store data in various formats including ``JSON``, ``CSV``, ``Avro``, etc. Below are examples of how to handle data formats using custom methods for parsing and serialization integrated with Spark's DataFrame operations. - -CSV Format Handling -------------------- - -``DBReader`` -~~~~~~~~~~~~ - -To handle CSV formatted data stored in Kafka topics, use the :obj:`CSV.parse_column ` method. This method allows you to convert a CSV string column directly into a structured Spark DataFrame using a specified schema. - -.. 
code-block:: python - - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, IntegerType, StringType - - from onetl.db import DBReader - from onetl.file.format import CSV - from onetl.connection import Kafka - - spark = SparkSession.builder.appName("KafkaCSVExample").getOrCreate() - - kafka = Kafka(addresses=["kafka-broker1:9092"], cluster="example-cluster", spark=spark) - csv = CSV(sep=",", encoding="utf-8") - - reader = DBReader( - connection=kafka, - topic="topic_name", - ) - df = reader.run() - - df.show() - # +----+--------+--------+---------+------+-----------------------+-------------+ - # |key |value |topic |partition|offset|timestamp |timestampType| - # +----+--------+--------+---------+------+-----------------------+-------------+ - # |[31]|Alice,20|topicCSV|0 |0 |2024-04-24 13:02:25.911|0 | - # |[32]|Bob,25 |topicCSV|0 |1 |2024-04-24 13:02:25.922|0 | - # +----+--------+--------+---------+------+-----------------------+-------------+ - - # schema for parsing CSV data from Kafka - csv_schema = StructType( - [ - StructField("name", StringType(), nullable=True), - StructField("age", IntegerType(), nullable=True), - ] - ) - - parsed_df = df.select(csv.parse_column("value", csv_schema)) - parse_df.select("value").first() - # Row(value=Row(name='Alice', age=20)) - -``DBWriter`` -~~~~~~~~~~~~ - -To serialize structured data into CSV format and write it back to a Kafka topic, use the :obj:`CSV.serialize_column ` method. - -.. code-block:: python - - from onetl.db import DBWriter - from onetl.file.format import CSV - from onetl.connection import Kafka - - kafka = Kafka(...) - csv = CSV(sep=",", encoding="utf-8") - - df.select("value").show() - # +------------+ - # |value | - # +------------+ - # |{Alice, 20} | - # |{Bob, 25} | - # +------------+ - - - # serializing data parsed in reading example into CSV format - serialized_df = df.select(csv.serialize_column("value")) - - writer = DBWriter(connection=kafka, topic="topic_name") - writer.run(serialized_df) - - - serialized_df.show() - # +---+-----------+ - # |key|value | - # +---+-----------+ - # | 1|"Alice,20" | - # | 2|"Bob,25" | - # +---+-----------+ - -JSON Format Handling --------------------- - -``DBReader`` -~~~~~~~~~~~~ - -To process JSON formatted data from Kafka, use the :obj:`JSON.parse_column ` method. - -.. code-block:: python - - from onetl.file.format import JSON - - df.show() - # +----+-------------------------+----------+---------+------+-----------------------+-------------+ - # |key |value |topic |partition|offset|timestamp |timestampType| - # +----+-------------------------+----------+---------+------+-----------------------+-------------+ - # |[31]|{"name":"Alice","age":20}|topicKafka|0 |0 |2024-04-24 16:51:11.739|0 | - # |[32]|{"name":"Bob","age":25} |topicKafka|0 |1 |2024-04-24 16:51:11.749|0 | - # +----+-------------------------+----------+---------+------+-----------------------+-------------+ - - json = JSON() - - json_schema = StructType( - [ - StructField("name", StringType(), nullable=True), - StructField("age", IntegerType(), nullable=True), - ] - ) - - parsed_json_df = df.select(json.parse_column("value", json_schema)) - - parsed_json_df.first() - # Row(value=Row(name='Alice', age=20)) - -``DBWriter`` -~~~~~~~~~~~~ - -For serializing data into JSON format and sending it back to Kafka, use the :obj:`JSON.serialize_column `. - -.. 
code-block:: python - - from onetl.file.format import JSON - - df.show() - # +-----------+ - # |value | - # +-----------+ - # |{Alice, 20}| - # |{Bob, 25} | - # +-----------+ - - json = JSON() - - serialized_json_df = df.select(json.serialize_column("data_column")) - serialized_json_df.show() - # +-------------------------+ - # |value | - # +-------------------------+ - # |{"name":"Alice","age":20}| - # |{"name":"Bob","age":25} | - # +-------------------------+ - -Avro Format Handling --------------------- - -``DBReader`` -~~~~~~~~~~~~ - -To process Avro formatted data from Kafka, use the :obj:`Avro.parse_column ` method. This method allows you to convert a column containing Avro binary data directly into a structured Spark DataFrame using a predefined schema. - -.. code-block:: python - - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, IntegerType, StringType - - from onetl.db import DBReader - from onetl.file.format import Avro - from onetl.connection import Kafka - - spark = SparkSession.builder.appName("KafkaAvroExample").getOrCreate() - - kafka = Kafka(...) - avro = Avro( - schema_dict={ - "type": "record", - "name": "Person", - "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}], - } - ) - - reader = DBReader( - connection=kafka, - topic="topic_name", - ) - df = reader.run() - - df.show() - # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ - # |key |value |topic |partition|offset|timestamp |timestampType| - # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ - # |[31]|[02 02 02 08 76 6... (binary data)] |topicAvro |0 |0 |2024-04-24 13:02:25.911|0 | - # |[32]|[02 04 02 08 76 6... (binary data)] |topicAvro |0 |1 |2024-04-24 13:02:25.922|0 | - # +----+------------------------------------+----------+---------+------+-----------------------+-------------+ - - parsed_df = df.select(avro.parse_column("value")) - parsed_df.show() - # +-----+----+ - # | name| age| - # +-----+----+ - # |Alice| 20| - # | Bob| 25| - # +-----+----+ - -``DBWriter`` -~~~~~~~~~~~~ - -To serialize structured data into Avro format and write it back to a Kafka topic, use the :obj:`Avro.serialize_column ` method. - -.. code-block:: python - - from onetl.db import DBWriter - from onetl.file.format import Avro - from onetl.connection import Kafka - - kafka = Kafka(...) - avro = Avro( - schema_dict={ - "type": "record", - "name": "Person", - "fields": [{"name": "name", "type": "string"}, {"name": "age", "type": "int"}], - } - ) - - df.select("value").show() - # +-----------+ - # |value | - # +-----------+ - # |{Alice, 20}| - # |{Bob, 25} | - # +-----------+ - - # serializing data into Avro format - serialized_df = df.select(avro.serialize_column("value")) - - serialized_df.show() - # +---+------------------------------------+ - # |key|value | - # +---+------------------------------------+ - # | 1|[02 02 02 08 76 6... (binary data)] | - # | 2|[02 04 02 08 76 6... (binary data)] | - # +---+------------------------------------+ - -XML Format Handling -------------------- - -Handling XML data in Kafka involves parsing string representations of XML into structured Spark DataFrame format. - -``DBReader`` -~~~~~~~~~~~~ - -To process XML formatted data from Kafka, use the :obj:`XML.parse_column ` method. This method allows you to convert a column containing XML strings directly into a structured Spark DataFrame using a specified schema. 
- -.. code-block:: python - - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, StringType, IntegerType - - from onetl.db import DBReader - from onetl.file.format import XML - from onetl.connection import Kafka - - spark = SparkSession.builder.appName("KafkaXMLExample").getOrCreate() - - kafka = Kafka(...) - xml = XML(row_tag="person") - - reader = DBReader( - connection=kafka, - topic="topic_name", - ) - df = reader.run() - - df.show() - # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ - # |key |value |topic |partition|offset|timestamp |timestampType| - # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ - # |[31]|"Alice20" |topicXML |0 |0 |2024-04-24 13:02:25.911|0 | - # |[32]|"Bob25" |topicXML |0 |1 |2024-04-24 13:02:25.922|0 | - # +----+--------------------------------------------------------------------------------------------+----------+---------+------+-----------------------+-------------+ - - xml_schema = StructType( - [ - StructField("name", StringType(), nullable=True), - StructField("age", IntegerType(), nullable=True), - ] - ) - parsed_xml_df = df.select(xml.parse_column("value", xml_schema)) - parsed_xml_df.show() - # +-----------+ - # |value | - # +-----------+ - # |{Alice, 20}| - # |{Bob, 25} | - # +-----------+ diff --git a/docs/connection/db_connection/kafka/index.rst b/docs/connection/db_connection/kafka/index.rst index 6076c8087..9cef52066 100644 --- a/docs/connection/db_connection/kafka/index.rst +++ b/docs/connection/db_connection/kafka/index.rst @@ -7,7 +7,9 @@ Kafka :maxdepth: 1 :caption: Connection + prerequisites connection + troubleshooting .. toctree:: :maxdepth: 1 @@ -31,12 +33,6 @@ Kafka read write -.. toctree:: - :maxdepth: 1 - :caption: Troubleshooting - - format_handling - .. toctree:: :maxdepth: 1 :caption: For developers diff --git a/docs/connection/db_connection/kafka/prerequisites.rst b/docs/connection/db_connection/kafka/prerequisites.rst new file mode 100644 index 000000000..4df271849 --- /dev/null +++ b/docs/connection/db_connection/kafka/prerequisites.rst @@ -0,0 +1,70 @@ +.. _kafka-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Kafka server versions: 0.10 or higher +* Spark versions: 2.4.x - 3.5.x +* Java versions: 8 - 17 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Kafka connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Kafka +----------------------- + +Connection address +~~~~~~~~~~~~~~~~~~ + +Kafka is a distributed service, and usually has a list of brokers you can connect to (unlike other connectors, there only one host+port can be set). +Please contact your Kafka administrator to get addresses of these brokers, as there are no defaults. + +Also Kafka has a feature called *advertised listeners* - client connects to one broker, and received list of other brokers in the clusters. +So you don't have to pass all brokers to ``addresses``, it can be some subset. Other broker addresses will be fetched directly from the cluster. + +Connection protocol +~~~~~~~~~~~~~~~~~~~ + +Kafka can support different connection protocols. 
List of currently supported protocols:
+    * :obj:`PLAINTEXT ` (not secure)
+    * :obj:`SSL ` (secure, recommended)
+
+Note that a specific port can listen for only one of these protocols, so it is important to set
+a proper port number + protocol combination.
+
+Authentication mechanism
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Kafka supports different authentication mechanisms (also known as `SASL `_).
+List of currently supported mechanisms:
+    * :obj:`PLAIN `. To avoid confusing this with the ``PLAINTEXT`` connection protocol, onETL uses the name ``BasicAuth``.
+    * :obj:`GSSAPI `. To simplify naming, onETL uses the name ``KerberosAuth``.
+    * :obj:`SCRAM-SHA-256 or SCRAM-SHA-512 ` (recommended).
+
+Different mechanisms use different types of credentials (login + password, keytab file, and so on).
+
+Note that connection protocol and auth mechanism are set in pairs:
+    * If you see ``SASL_PLAINTEXT``, this means ``PLAINTEXT`` connection protocol + some auth mechanism.
+    * If you see ``SASL_SSL``, this means ``SSL`` connection protocol + some auth mechanism.
+    * If you see just ``PLAINTEXT`` or ``SSL`` (**no** ``SASL``), this means that authentication is disabled (anonymous access).
+
+Please contact your Kafka administrator to get details about the auth mechanisms enabled in a specific Kafka instance.
+A complete example combining connection protocol and auth mechanism is shown at the end of this page.
+
+Required grants
+~~~~~~~~~~~~~~~
+
+Ask your Kafka administrator to set the following grants for a user, *if the Kafka instance uses ACLs*:
+    * ``Describe`` + ``Read`` for reading data from Kafka (Consumer).
+    * ``Describe`` + ``Write`` for writing data to Kafka (Producer).
+
+More details can be found in `documentation `_.
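+
+Connection example
+~~~~~~~~~~~~~~~~~~
+
+A minimal sketch of how connection protocol and auth mechanism are combined when creating the connection.
+Addresses, cluster name and credentials below are placeholders, ``spark`` is an existing Spark session,
+and ``SSLProtocol(...)`` stands for the certificate options described in its own documentation:
+
+.. code-block:: python
+
+    from onetl.connection import Kafka
+
+    kafka = Kafka(
+        addresses=["broker1:9092", "broker2:9092"],
+        cluster="my-cluster",
+        # SASL_SSL = SSL connection protocol + SCRAM auth mechanism
+        protocol=Kafka.SSLProtocol(...),
+        auth=Kafka.ScramAuth(user="me", password="abc", digest="SHA-512"),
+        spark=spark,
+    )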
diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst
index 8b2917943..bab231ffb 100644
--- a/docs/connection/db_connection/kafka/read.rst
+++ b/docs/connection/db_connection/kafka/read.rst
@@ -3,62 +3,132 @@
 Reading from Kafka
 ==================
 
-For reading data from Kafka, use :obj:`DBReader ` with specific options (see below).
+Data can be read from Kafka to Spark using :obj:`DBReader `.
+It also supports :ref:`strategy` for incremental data reading.
+
+Supported DBReader features
+---------------------------
+
+* ❌ ``columns`` (is not supported by Kafka)
+* ❌ ``where`` (is not supported by Kafka)
+* ✅︎ ``hwm``, supported strategies:
+* * ✅︎ :ref:`snapshot-strategy`
+* * ✅︎ :ref:`incremental-strategy`
+* * ❌ :ref:`snapshot-batch-strategy`
+* * ❌ :ref:`incremental-batch-strategy`
+* ❌ ``hint`` (is not supported by Kafka)
+* ❌ ``df_schema`` (see note below)
+* ✅︎ ``options`` (see :obj:`KafkaReadOptions `)
+
+Dataframe schema
+----------------
+
+Unlike other DB connections, Kafka does not have a concept of columns.
+All messages in a topic have the same set of fields; see the structure below:
+
+.. code:: text
+
+    root
+    |-- key: binary (nullable = true)
+    |-- value: binary (nullable = true)
+    |-- topic: string (nullable = false)
+    |-- partition: integer (nullable = false)
+    |-- offset: integer (nullable = false)
+    |-- timestamp: timestamp (nullable = false)
+    |-- timestampType: integer (nullable = false)
+    |-- headers: array (nullable = true)
+    |    |-- element: struct (containsNull = true)
+    |    |    |-- key: string (nullable = false)
+    |    |    |-- value: binary (nullable = true)
+
+The ``headers`` field is present in the dataframe only if ``Kafka.ReadOptions(include_headers=True)`` is passed (compatibility with Kafka 1.x).
+
+
+Value deserialization
+---------------------
+
+To read ``value`` or ``key`` of a type other than bytes (e.g. struct or integer), users have to deserialize values manually.
+
+This can be done using one of the following methods:
+    * :obj:`Avro.parse_column `
+    * :obj:`JSON.parse_column `
+    * :obj:`CSV.parse_column `
+    * :obj:`XML.parse_column `
+
+Examples
+--------
+
+Snapshot strategy, ``value`` is Avro binary data:
+
+.. code-block:: python
+
+    from onetl.connection import Kafka
+    from onetl.db import DBReader
+    from onetl.file.format import Avro
+    from pyspark.sql.functions import decode
+
+    # read all topic data from Kafka
+    kafka = Kafka(...)
+    reader = DBReader(connection=kafka, source="avro_topic")
+    read_df = reader.run()
+
+    # parse Avro format to Spark struct
+    avro = Avro(
+        schema_dict={
+            "type": "record",
+            "name": "Person",
+            "fields": [
+                {"name": "name", "type": "string"},
+                {"name": "age", "type": "int"},
+            ],
+        }
+    )
+    deserialized_df = read_df.select(
+        # cast binary key to string
+        decode("key", "UTF-8").alias("key"),
+        avro.parse_column("value"),
+    )
+
+Incremental strategy, ``value`` is a JSON string:
 
 .. note::
 
-    Unlike other connection classes, Kafka always return dataframe with fixed schema
-    (see `documentation `_):
-
-    .. dropdown:: DataFrame Schema
-
-        .. code:: python
-
-            from pyspark.sql.types import (
-                ArrayType,
-                BinaryType,
-                IntegerType,
-                LongType,
-                StringType,
-                StructField,
-                StructType,
-                TimestampType,
-            )
-
-            schema = StructType(
-                [
-                    StructField("value", BinaryType(), nullable=True),
-                    StructField("key", BinaryType(), nullable=True),
-                    StructField("topic", StringType(), nullable=False),
-                    StructField("partition", IntegerType(), nullable=False),
-                    StructField("offset", LongType(), nullable=False),
-                    StructField("timestamp", TimestampType(), nullable=False),
-                    StructField("timestampType", IntegerType(), nullable=False),
-                    # this field is returned only with ``include_headers=True``
-                    StructField(
-                        "headers",
-                        ArrayType(
-                            StructType(
-                                [
-                                    StructField("key", StringType(), nullable=False),
-                                    StructField("value", BinaryType(), nullable=True),
-                                ],
-                            ),
-                        ),
-                        nullable=True,
-                    ),
-                ],
-            )
-
-.. warning::
-
-    Columns:
-
-    * ``value``
-    * ``key``
-    * ``headers[*].value``
-
-    are always returned as raw bytes. If they contain values of custom type, these values should be deserialized manually.
+    Currently the Kafka connector supports only HWMs based on the ``offset`` field. Other fields, like ``timestamp``, are not yet supported.
+
+.. code-block:: python
+
+    from onetl.connection import Kafka
+    from onetl.db import DBReader
+    from onetl.file.format import JSON
+    from onetl.strategy import IncrementalStrategy
+    from pyspark.sql.functions import decode
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+    kafka = Kafka(...)
+
+    # read only new data from Kafka topic
+    reader = DBReader(
+        connection=kafka,
+        source="topic_name",
+        hwm=DBReader.AutoDetectHWM(name="kafka_hwm", expression="offset"),
+    )
+
+    with IncrementalStrategy():
+        read_df = reader.run()
+
+    # parse JSON format to Spark struct
+    json = JSON()
+    schema = StructType(
+        [
+            StructField("name", StringType(), nullable=True),
+            StructField("age", IntegerType(), nullable=True),
+        ],
+    )
+    deserialized_df = read_df.select(
+        # cast binary key to string
+        decode("key", "UTF-8").alias("key"),
+        json.parse_column("value", schema),
+    )
+
+Options
+-------
 
 .. currentmodule:: onetl.connection.db_connection.kafka.options
diff --git a/docs/connection/db_connection/kafka/troubleshooting.rst b/docs/connection/db_connection/kafka/troubleshooting.rst
new file mode 100644
index 000000000..fde6fafa6
--- /dev/null
+++ b/docs/connection/db_connection/kafka/troubleshooting.rst
@@ -0,0 +1,13 @@
+.. _kafka-troubleshooting:
+
+Kafka Troubleshooting
+=====================
+
+.. note::
+
+    General guide: :ref:`troubleshooting`.
+
+Cannot connect using ``SSL`` protocol
+-------------------------------------
+
+Please check that certificate files are not Base-64 encoded.
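+
+For example, a quick check could look like this (the certificate path below is just a placeholder):
+
+.. code-block:: python
+
+    # a plain PEM certificate starts with a readable "-----BEGIN ..." header line,
+    # while a Base-64 encoded copy of it does not
+    with open("/path/to/server.crt", "rb") as file:
+        first_line = file.readline().strip()
+
+    print(first_line.startswith(b"-----BEGIN"))
+    # True  - file is a plain PEM certificate
+    # False - file is probably encoded one more time, or is not PEM at all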
diff --git a/docs/connection/db_connection/kafka/write.rst b/docs/connection/db_connection/kafka/write.rst
index 064c8ead1..6b24aa251 100644
--- a/docs/connection/db_connection/kafka/write.rst
+++ b/docs/connection/db_connection/kafka/write.rst
@@ -5,58 +5,68 @@ Writing to Kafka
 
 For writing data to Kafka, use :obj:`DBWriter ` with specific options (see below).
 
-.. note::
-
-    Unlike other connection classes, Kafka only accepts dataframe with fixed schema
-    (see `documentation `_):
-
-    .. dropdown:: DataFrame Schema
-
-        .. code:: python
-
-            from pyspark.sql.types import (
-                ArrayType,
-                BinaryType,
-                IntegerType,
-                StringType,
-                StructField,
-                StructType,
-            )
-
-            schema = StructType(
-                [
-                    # mandatory fields:
-                    StructField("value", BinaryType(), nullable=True),
-                    # optional fields, can be omitted:
-                    StructField("key", BinaryType(), nullable=True),
-                    StructField("partition", IntegerType(), nullable=True),
-                    # this field can be passed only with ``include_headers=True``
-                    StructField(
-                        "headers",
-                        ArrayType(
-                            StructType(
-                                [
-                                    StructField("key", StringType(), nullable=False),
-                                    StructField("value", BinaryType(), nullable=True),
-                                ],
-                            ),
-                        ),
-                        nullable=True,
-                    ),
-                ],
-            )
-
-    You cannot pass dataframe with other column names or types.
-
-.. warning::
-
-    Columns:
-
-    * ``value``
-    * ``key``
-    * ``headers[*].value``
-
-    can only be string or raw bytes. If they contain values of custom type, these values should be serialized manually.
+Dataframe schema
+----------------
+
+Unlike other DB connections, Kafka does not have a concept of columns.
+All messages in a topic have the same set of fields. Only some of them can be written:
+
+.. code:: text
+
+    root
+    |-- key: binary (nullable = true)
+    |-- value: binary (nullable = true)
+    |-- headers: array (nullable = true)
+    |    |-- element: struct (containsNull = true)
+    |    |    |-- key: string (nullable = false)
+    |    |    |-- value: binary (nullable = true)
+
+``headers`` can be passed only with ``Kafka.WriteOptions(include_headers=True)`` (compatibility with Kafka 1.x).
+
+The ``topic`` field should not be present in the dataframe, as it is passed to ``DBWriter(target=...)``.
+
+Other fields, like ``partition``, ``offset`` or ``timestamp``, are set by Kafka and cannot be passed explicitly.
+
+Value serialization
+-------------------
+
+To write ``value`` or ``key`` of a type other than bytes (e.g. struct or integer), users have to serialize values manually.
+
+This can be done using one of the following methods:
+    * :obj:`Avro.serialize_column `
+    * :obj:`JSON.serialize_column `
+    * :obj:`CSV.serialize_column `
+
+Examples
+--------
+
+Convert ``value`` to a JSON string, and write it to Kafka:
+
+.. code-block:: python
+
+    from onetl.connection import Kafka
+    from onetl.db import DBWriter
+    from onetl.file.format import JSON
+
+    df = ...  # original data is here
+
+    # serialize struct data as JSON
+    json = JSON()
+    write_df = df.select(
+        df.key,
+        json.serialize_column(df.value),
+    )
+
+    # write data to Kafka
+    kafka = Kafka(...)
+
+    writer = DBWriter(
+        connection=kafka,
+        target="topic_name",
+    )
+    writer.run(write_df)
+
+Options
+-------
 
 .. 
currentmodule:: onetl.connection.db_connection.kafka.options diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index dd834495b..0e5895ac1 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -57,15 +57,13 @@ class Kafka(DBConnection): Based on `official Kafka Source For Spark `_. - .. note:: + .. warning:: - This connector is for batch download from kafka and not streaming. + Before using this connector please take into account :ref:`kafka-prerequisites` - .. dropdown:: Version compatibility + .. note:: - * Apache Kafka versions: 0.10 or higher - * Spark versions: 2.4.x - 3.5.x - * Scala versions: 2.11 - 2.13 + This connector is for **batch** ETL processes, not streaming. Parameters ---------- diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 6f56ca55d..27f889ed1 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -111,7 +111,7 @@ class KafkaKerberosAuth(KafkaAuth, GenericOptions): # options without sasl.kerberos. prefix are passed to JAAS config # names are in camel case! "isInitiator": True, - # options with sasl.kerberos. prefix are passed to Kafka client config + # options with `sasl.kerberos.` prefix are passed to Kafka client config as-is "sasl.kerberos.kinit.cmd": "/usr/bin/kinit", } ) diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 2015123ee..ddd9280fa 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -21,7 +21,7 @@ class KafkaScramAuth(KafkaAuth, GenericOptions): """ - Connect to Kafka using ``sasl.mechanism="SCRAM-SHA-*"``. + Connect to Kafka using ``sasl.mechanism="SCRAM-SHA-256"`` or ``sasl.mechanism="SCRAM-SHA-512"``. For more details see `Kafka Documentation `_. @@ -51,7 +51,7 @@ class KafkaScramAuth(KafkaAuth, GenericOptions): "user": "me", "password": "abc", "digest": "SHA-512", - # options with sasl.login. prefix are passed to Kafka client config + # options with `sasl.login.` prefix are passed to Kafka client config as-is "sasl.login.class": "com.example.CustomScramLogin", } ) diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index f7b9ecc15..96e9d89b2 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -66,7 +66,7 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): protocol = Kafka.SSLProtocol.parse( { - # Just the same options as above, but using Kafka config naming + # Just the same options as above, but using Kafka config naming with dots "ssl.keystore.type": "PEM", "ssl.keystore.certificate_chain": "-----BEGIN CERTIFICATE-----\\nMIIDZjC...\\n-----END CERTIFICATE-----", "ssl.keystore.key": "-----BEGIN PRIVATE KEY-----\\nMIIEvg..\\n-----END PRIVATE KEY-----", @@ -79,7 +79,9 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): .. dropdown :: Not recommended - Pass PEM certificates as files: + These options are error-prone and have several drawbacks, so it is not recommended to use them. 
+ + Passing PEM certificates as files: * ENCRYPT ``user.key`` file with password ``"some password"`` `using PKCS#8 scheme `_. * Save encrypted key to file ``/path/to/user/encrypted_key_with_certificate_chain.pem``. @@ -97,7 +99,7 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): truststore_location="/path/to/server.crt", ) - Pass JKS (Java Key Store) location: + Passing JKS (Java Key Store) location: * `Add user key and certificate to JKS keystore `_. * `Add server certificate to JKS truststore `_. diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index e07c66f51..7cfae4408 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -98,7 +98,10 @@ class Avro(ReadWriteFileFormat): schema = { "type": "record", "name": "Person", - "fields": [{"name": "name", "type": "string"}], + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"}, + ], } avro = Avro(schema_dict=schema, compression="snappy") @@ -210,12 +213,12 @@ def parse_column(self, column: str | Column) -> Column: Parameters ---------- column : str | Column - The name of the column or the Column object containing Avro binary data to parse. + The name of the column or the column object containing Avro bytes to deserialize. + Schema should match the provided Avro schema. Returns ------- - Column - A new Column object with data parsed from Avro binary to the specified structured format. + Column with deserialized data. Schema is matching the provided Avro schema. Column name is the same as input column. Raises ------ @@ -224,27 +227,51 @@ def parse_column(self, column: str | Column) -> Column: ImportError If ``schema_url`` is used and the ``requests`` library is not installed. - Examples -------- - .. code:: python - - from pyspark.sql import SparkSession - - from onetl.file.format import Avro - - spark = SparkSession.builder.appName("AvroParsingExample").getOrCreate() - schema_dict = { - "type": "record", - "name": "Person", - "fields": [{"name": "name", "type": "string"}], - } - avro = Avro(schema_dict=schema_dict) - df = spark.createDataFrame([("bytes_data_here",)], ["avro_data"]) - - parsed_df = df.select(avro.parse_column("avro_data")) - parsed_df.show() + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import Avro + >>> df.show() + +----+----------------------+----------+---------+------+-----------------------+-------------+ + |key |value |topic |partition|offset|timestamp |timestampType| + +----+----------------------+----------+---------+------+-----------------------+-------------+ + |[31]|[0A 41 6C 69 63 65 28]|topicAvro |0 |0 |2024-04-24 13:02:25.911|0 | + |[32]|[06 42 6F 62 32] |topicAvro |0 |1 |2024-04-24 13:02:25.922|0 | + +----+----------------------+----------+---------+------+-----------------------+-------------+ + >>> df.printSchema() + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = true) + |-- partition: integer (nullable = true) + |-- offset: integer (nullable = true) + |-- timestamp: timestamp (nullable = true) + |-- timestampType: integer (nullable = true) + >>> avro = Avro( + ... schema_dict={ + ... "type": "record", + ... "name": "Person", + ... "fields": [ + ... {"name": "name", "type": "string"}, + ... {"name": "age", "type": "int"}, + ... ], + ... } + ... 
) + >>> parsed_df = df.select(decode("key", "UTF-8").alias("key"), avro.parse_column("value")) + >>> parsed_df.show(truncate=False) + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 |{Bob, 25} | + +---+-----------+ + >>> parsed_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 from pyspark.sql.functions import col @@ -286,12 +313,11 @@ def serialize_column(self, column: str | Column) -> Column: Parameters ---------- column : str | Column - The name of the column or the Column object containing the data to serialize to Avro format. + The name of the column or the column object containing the data to serialize to Avro format. Returns ------- - Column - A new Column object with data serialized from Spark SQL structures to Avro binary. + Column with binary Avro data. Column name is the same as input column. Raises ------ @@ -302,25 +328,45 @@ def serialize_column(self, column: str | Column) -> Column: Examples -------- - .. code:: python - - from pyspark.sql import SparkSession - - from onetl.file.format import Avro - - spark = SparkSession.builder.appName("AvroSerializationExample").getOrCreate() - schema_dict = { - "type": "record", - "name": "Person", - "fields": [{"name": "id", "type": "long"}, {"name": "name", "type": "string"}], - } - - avro = Avro(schema_dict=schema_dict) - df = spark.createDataFrame([(1, "John Doe"), (2, "Jane Doe")], ["id", "name"]) - - serialized_df = df.select(avro.serialize_column("name")) - serialized_df.show() + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import Avro + >>> df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into Avro format + >>> avro = Avro( + ... schema_dict={ + ... "type": "record", + ... "name": "Person", + ... "fields": [ + ... {"name": "name", "type": "string"}, + ... {"name": "age", "type": "int"}, + ... ], + ... } + ... ) + >>> serialized_df = df.select("key", avro.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +---+----------------------+ + |key|value | + +---+----------------------+ + | 1|[0A 41 6C 69 63 65 28]| + | 2|[06 42 6F 62 32] | + +---+----------------------+ + >>> serialized_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: binary (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 from pyspark.sql.functions import col diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 8b8f1922b..9ed1579cb 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -126,32 +126,52 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: Parameters ---------- column : str | Column - The name of the column or the Column object containing CSV strings to parse. + The name of the column or the column object containing CSV strings/bytes to parse. schema : StructType - The schema to apply when parsing the CSV data. This defines the structure of the output DataFrame CSV column. + The schema to apply when parsing the CSV data. This defines the structure of the output DataFrame column. 
Returns ------- - Column - A new Column object with data parsed from CSV string to the specified CSV structured format. + Column with deserialized data, with the same structure as the provided schema. Column name is the same as input column. Examples -------- - .. code:: python - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, IntegerType, StringType - - spark = SparkSession.builder.appName("CSVParsingExample").getOrCreate() - csv = CSV() - df = spark.createDataFrame([("1,some",), ("2,another",)], ["csv_string"]) - schema = StructType( - [StructField("id", IntegerType()), StructField("text", StringType())] - ) - - parsed_df = df.select(csv.parse_column("csv_string", schema)) - parsed_df.show() + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from onetl.file.format import CSV + >>> df.show() + +--+--------+ + |id|value | + +--+--------+ + |1 |Alice;20| + |2 |Bob;25 | + +--+--------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) + >>> csv = CSV(delimiter=";") + >>> csv_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... ) + >>> parsed_df = df.select("id", csv.parse_column("value", csv_schema)) + >>> parsed_df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> parsed_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 @@ -188,23 +208,40 @@ def serialize_column(self, column: str | Column) -> Column: Returns ------- - Column - A new Column object with data serialized from Spark SQL structures to CSV string. + Column with string CSV data. Column name is the same as input column. Examples -------- - .. 
code:: python - - from pyspark.sql import SparkSession - from pyspark.sql.functions import struct - - spark = SparkSession.builder.appName("CSVSerializationExample").getOrCreate() - csv = CSV() - df = spark.createDataFrame([(123, "John")], ["id", "name"]) - df = df.withColumn("combined", struct("id", "name")) - serialized_df = df.select(csv.serialize_column("combined")) - serialized_df.show() + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import CSV + >>> df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into CSV format + >>> csv = CSV(delimiter=";") + >>> serialized_df = df.select("id", csv.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +--+--------+ + |id|value | + +--+--------+ + |1 |Alice;20| + |2 |Bob;25 | + +--+--------+ + >>> serialized_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 455cf29a0..630865232 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -113,32 +113,58 @@ def parse_column(self, column: str | Column, schema: StructType | ArrayType | Ma Parameters ---------- column : str | Column - The name of the column or the Column object containing JSON strings to parse. + The name of the column or the column object containing JSON strings/bytes to parse. schema : StructType | ArrayType | MapType The schema to apply when parsing the JSON data. This defines the structure of the output DataFrame column. Returns ------- - Column - A new Column object with data parsed from JSON string to the specified structure. + Column with deserialized data, with the same structure as the provided schema. Column name is the same as input column. Examples -------- - .. 
code:: python - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, IntegerType, StringType - - spark = SparkSession.builder.appName("JSONParsingExample").getOrCreate() - json = JSON() - df = spark.createDataFrame([(1, '{"id":123, "name":"John"}')], ["id", "json_string"]) - schema = StructType( - [StructField("id", IntegerType()), StructField("name", StringType())] - ) - - parsed_df = df.select(json.parse_column("json_string", schema)) - parsed_df.show() + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import JSON + >>> df.show() + +----+--------------------+----------+---------+------+-----------------------+-------------+ + |key |value |topic |partition|offset|timestamp |timestampType| + +----+--------------------+----------+---------+------+-----------------------+-------------+ + |[31]|[7B 22 6E 61 6D 6...|topicJSON |0 |0 |2024-04-24 16:51:11.739|0 | + |[32]|[7B 22 6E 61 6D 6...|topicJSON |0 |1 |2024-04-24 16:51:11.749|0 | + +----+--------------------+----------+---------+------+-----------------------+-------------+ + >>> df.printSchema() + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = true) + |-- partition: integer (nullable = true) + |-- offset: integer (nullable = true) + |-- timestamp: timestamp (nullable = true) + |-- timestampType: integer (nullable = true) + >>> json = JSON() + >>> json_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... ) + >>> parsed_df = df.select(decode("key", "UTF-8").alias("key"), json.parse_column("value", json_schema)) + >>> parsed_df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> parsed_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 from pyspark.sql.functions import col, from_json @@ -155,32 +181,50 @@ def parse_column(self, column: str | Column, schema: StructType | ArrayType | Ma def serialize_column(self, column: str | Column) -> Column: """ - Serializes a structured Spark SQL column into a JSON string column using Spark's `to_json `_ function. + Serializes a structured Spark SQL column into a JSON string column using Spark's + `to_json `_ function. Parameters ---------- column : str | Column - The name of the column or the Column object containing the data to serialize to JSON. + The name of the column or the column object containing the data to serialize to JSON format. Returns ------- - Column - A new Column object with data serialized from Spark SQL structures to JSON string. + Column with string JSON data. Column name is the same as input column. Examples -------- - .. 
code:: python - - from pyspark.sql import SparkSession - from pyspark.sql.functions import struct - - spark = SparkSession.builder.appName("JSONSerializationExample").getOrCreate() - json = JSON() - df = spark.createDataFrame([(123, "John")], ["id", "name"]) - df = df.withColumn("combined", struct("id", "name")) - serialized_df = df.select(json.serialize_column("combined")) - serialized_df.show() + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import JSON + >>> df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into JSON format + >>> json = JSON() + >>> serialized_df = df.select("key", json.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +---+-------------------------+ + |key|value | + +---+-------------------------+ + | 1|{"name":"Alice","age":20}| + | 2|{"name":"Bob","age":25} | + +---+-------------------------+ + >>> serialized_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: string (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 from pyspark.sql.functions import col, to_json diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index f1dc337b3..1c1f954eb 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -240,7 +240,8 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: .. note:: - This method parses each DataFrame row individually. Therefore, for a specific column, each row must contain exactly one occurrence of the ``rowTag`` specified. If your XML data includes a root tag that encapsulates multiple row tags, you can adjust the schema to use an ``ArrayType`` to keep all child elements under the single root. + This method parses each DataFrame row individually. Therefore, for a specific column, each row must contain exactly one occurrence of the ``rowTag`` specified. + If your XML data includes a root tag that encapsulates multiple row tags, you can adjust the schema to use an ``ArrayType`` to keep all child elements under the single root. .. code-block:: xml @@ -254,7 +255,11 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: .. code-block:: python from pyspark.sql.types import StructType, StructField, ArrayType, StringType + from onetl.file.format import XML + # each DataFrame row has exactly one tag + xml = XML(rowTag="books") + # each tag have multiple tags, so using ArrayType for such field schema = StructType( [ StructField( @@ -264,54 +269,63 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: [ StructField("title", StringType(), True), StructField("author", StringType(), True), - ] - ) + ], + ), ), True, - ) - ] + ), + ], ) Parameters ---------- column : str | Column - The name of the column or the Column object containing XML strings to parse. + The name of the column or the column object containing XML strings/bytes to parse. + + schema : StructType + The schema to apply when parsing the XML data. This defines the structure of the output DataFrame column. Returns ------- - Column - A new Column object with data parsed from XML string to the specified structured format. + Column with deserialized data, with the same structure as the provided schema. 
Column name is the same as input column. Examples -------- - .. code-block:: python - - from pyspark.sql import SparkSession - from pyspark.sql.types import StructType, StructField, StringType, IntegerType - - from onetl.file.format import XML - - spark = SparkSession.builder.appName("XMLParsingExample").getOrCreate() - schema = StructType( - [ - StructField("author", StringType(), nullable=True), - StructField("title", StringType(), nullable=True), - StructField("genre", StringType(), nullable=True), - StructField("price", IntegerType(), nullable=True), - ] - ) - xml_processor = XML(row_tag="book") - - data = [ - ( - "Austen, JanePride and Prejudiceromance19", - ) - ] - df = spark.createDataFrame(data, ["xml_string"]) - - parsed_df = df.select(xml_processor.parse_column("xml_string", schema=schema)) - parsed_df.show() + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from onetl.file.format import XML + >>> df.show() + +--+------------------------------------------------+ + |id|value | + +--+------------------------------------------------+ + |1 |Alice20| + |2 |Bob25 | + +--+------------------------------------------------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) + >>> xml = XML(rowTag="person") + >>> xml_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... ) + >>> parsed_df = df.select("key", xml.parse_column("value", xml_schema)) + >>> parsed_df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> parsed_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) """ from pyspark.sql import Column, SparkSession # noqa: WPS442 From 1f73652e3b3b4efbba025e305c38efc5d1b73c84 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Wed, 22 May 2024 16:05:22 +0300 Subject: [PATCH 56/71] [DOP-15764] - add jdbc option classes (#277) * [DOP-15764] - add jdbc option classes * [DOP-15764] - remove generic types * [DOP-15764] - update docs * [DOP-15764] - update docs * [DOP-15764] - add parametrized tests * [DOP-15764] - remove duplicated autodoc --- .../db_connection/clickhouse/execute.rst | 12 +- .../db_connection/clickhouse/read.rst | 7 +- .../db_connection/clickhouse/sql.rst | 8 +- .../db_connection/clickhouse/write.rst | 7 +- .../db_connection/greenplum/execute.rst | 12 +- .../db_connection/greenplum/read.rst | 2 +- docs/connection/db_connection/kafka/read.rst | 2 +- .../connection/db_connection/mongodb/read.rst | 2 +- .../db_connection/mongodb/write.rst | 2 +- .../db_connection/mssql/execute.rst | 13 +- docs/connection/db_connection/mssql/read.rst | 7 +- docs/connection/db_connection/mssql/sql.rst | 10 +- docs/connection/db_connection/mssql/write.rst | 7 +- .../db_connection/mysql/execute.rst | 16 +- docs/connection/db_connection/mysql/read.rst | 9 +- docs/connection/db_connection/mysql/sql.rst | 8 +- docs/connection/db_connection/mysql/write.rst | 7 +- .../db_connection/oracle/execute.rst | 16 +- docs/connection/db_connection/oracle/read.rst | 9 +- docs/connection/db_connection/oracle/sql.rst | 8 +- .../connection/db_connection/oracle/write.rst | 7 +- .../db_connection/postgres/execute.rst | 16 +- .../db_connection/postgres/read.rst | 9 +- 
.../connection/db_connection/postgres/sql.rst | 8 +- .../db_connection/postgres/write.rst | 7 +- .../db_connection/teradata/execute.rst | 16 +- .../db_connection/teradata/read.rst | 9 +- .../connection/db_connection/teradata/sql.rst | 8 +- .../db_connection/teradata/write.rst | 7 +- .../db_connection/clickhouse/connection.py | 13 ++ .../db_connection/clickhouse/options.py | 33 ++++ .../db_connection/greenplum/connection.py | 11 +- .../db_connection/greenplum/options.py | 17 ++ .../db_connection/mssql/connection.py | 13 ++ .../connection/db_connection/mssql/options.py | 32 ++++ .../db_connection/mysql/connection.py | 13 ++ .../connection/db_connection/mysql/options.py | 33 ++++ .../db_connection/oracle/connection.py | 16 +- .../db_connection/oracle/options.py | 33 ++++ .../db_connection/postgres/connection.py | 13 ++ .../db_connection/postgres/options.py | 32 ++++ .../db_connection/teradata/connection.py | 13 ++ .../db_connection/teradata/options.py | 32 ++++ .../test_db_options_unit.py | 94 +++++++++-- .../test_greenplum_unit.py | 4 +- .../test_jdbc_options_unit.py | 147 ++++++++++++++---- 46 files changed, 635 insertions(+), 165 deletions(-) create mode 100644 onetl/connection/db_connection/clickhouse/options.py create mode 100644 onetl/connection/db_connection/mssql/options.py create mode 100644 onetl/connection/db_connection/mysql/options.py create mode 100644 onetl/connection/db_connection/oracle/options.py create mode 100644 onetl/connection/db_connection/postgres/options.py create mode 100644 onetl/connection/db_connection/teradata/options.py diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index 03e5d5df1..2a8abdb70 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -20,7 +20,7 @@ Use ``Clickhouse.fetch`` Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`Clickhouse.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -60,7 +60,7 @@ Use ``Clickhouse.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`Clickhouse.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -111,15 +111,17 @@ So it should **NOT** be used to read large amounts of data. Use :ref:`DBReader < Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: ClickhouseFetchOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. 
autopydantic_model:: ClickhouseExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index 33bcff4ce..0b2bc929a 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Clickhouse) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Clickhouse.ReadOptions `) Examples -------- @@ -85,9 +85,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: ClickhouseReadOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst index a81eda8b9..376b2d0c3 100644 --- a/docs/connection/db_connection/clickhouse/sql.rst +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -70,6 +70,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCSQLOptions +.. autopydantic_model:: ClickhouseSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 1fe56868b..6237bdf16 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -45,11 +45,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`Clickhouse.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: ClickhouseWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index dcc32171b..c0470a396 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -20,7 +20,7 @@ Use ``Greenplum.fetch`` Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`Greenplum.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -60,7 +60,7 @@ Use ``Greenplum.execute`` Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`Greenplum.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -143,15 +143,17 @@ The only port used while interacting with Greenplum in this case is ``5432`` (Gr Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.greenplum.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: GreenplumFetchOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. autopydantic_model:: GreenplumExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 310521677..1c76c9c72 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -27,7 +27,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Greenplum) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`GreenplumReadOptions `) +* ✅︎ ``options`` (see :obj:`Greenplum.ReadOptions `) .. warning:: diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index bab231ffb..292089e06 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -18,7 +18,7 @@ Supported DBReader features * * ❌ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Kafka) * ❌ ``df_schema`` (see note below) -* ✅︎ ``options`` (see :obj:`KafkaReadOptions `) +* ✅︎ ``options`` (see :obj:`Kafka.ReadOptions `) Dataframe schema ---------------- diff --git a/docs/connection/db_connection/mongodb/read.rst b/docs/connection/db_connection/mongodb/read.rst index e90bc6e2b..860630e11 100644 --- a/docs/connection/db_connection/mongodb/read.rst +++ b/docs/connection/db_connection/mongodb/read.rst @@ -23,7 +23,7 @@ Supported DBReader features * * Note that ``expression`` field of HWM can only be a field name, not a custom expression * ✅︎ ``hint`` (see `official documentation `_) * ✅︎ ``df_schema`` (mandatory) -* ✅︎ ``options`` (see :obj:`MongoDBReadOptions `) +* ✅︎ ``options`` (see :obj:`MongoDB.ReadOptions `) Examples -------- diff --git a/docs/connection/db_connection/mongodb/write.rst b/docs/connection/db_connection/mongodb/write.rst index 3ae86fece..3686a3be7 100644 --- a/docs/connection/db_connection/mongodb/write.rst +++ b/docs/connection/db_connection/mongodb/write.rst @@ -35,7 +35,7 @@ Examples Write options ------------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MongoDB.WriteOptions ` .. currentmodule:: onetl.connection.db_connection.mongodb.options diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index b8b795a66..13b348fa9 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -20,7 +20,7 @@ Use ``MSSQL.fetch`` Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading MSSQL config, or reading data from some reference table. 
Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`MSSQL.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -59,7 +59,7 @@ Use ``MSSQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`MSSQL.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -103,15 +103,16 @@ Examples Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: MSSQLFetchOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false - -.. autopydantic_model:: JDBCExecuteOptions +.. autopydantic_model:: MSSQLExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/read.rst b/docs/connection/db_connection/mssql/read.rst index 0c8599aea..c15402a42 100644 --- a/docs/connection/db_connection/mssql/read.rst +++ b/docs/connection/db_connection/mssql/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (MSSQL does support hints, but DBReader not, at least for now) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`MSSQL.ReadOptions `) Examples -------- @@ -85,9 +85,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: MSSQLReadOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst index de4f6fe63..e456c6f4e 100644 --- a/docs/connection/db_connection/mssql/sql.rst +++ b/docs/connection/db_connection/mssql/sql.rst @@ -43,7 +43,7 @@ Examples WHERE key = 'something' """, - options=MSSQL.QLOptions( + options=MSSQL.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -70,6 +70,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCSQLOptions +.. autopydantic_model:: MSSQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/write.rst b/docs/connection/db_connection/mssql/write.rst index 854283704..75d9cceb2 100644 --- a/docs/connection/db_connection/mssql/write.rst +++ b/docs/connection/db_connection/mssql/write.rst @@ -40,11 +40,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MSSQL.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. 
currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: MSSQLWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index de1f17002..477ad6fa7 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -20,7 +20,7 @@ Use ``MySQL.fetch`` Use this method to perform some ``SELECT`` query which returns **small number or rows**, like reading MySQL config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`MySQL.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -60,7 +60,7 @@ Use ``MySQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`MySQL.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -104,15 +104,13 @@ Examples Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: MySQLFetchOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. autopydantic_model:: MySQLExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/read.rst b/docs/connection/db_connection/mysql/read.rst index e72da45f1..2618e6859 100644 --- a/docs/connection/db_connection/mysql/read.rst +++ b/docs/connection/db_connection/mysql/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ✅︎ ``hint`` (see `official documentation `_) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`MySQL.ReadOptions `) Examples -------- @@ -87,9 +87,8 @@ Especially if there are indexes for columns used in ``where`` clause. Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: MySQLReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst index 949fb3aa3..04881bec7 100644 --- a/docs/connection/db_connection/mysql/sql.rst +++ b/docs/connection/db_connection/mysql/sql.rst @@ -71,6 +71,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCSQLOptions +.. 
autopydantic_model:: MySQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/write.rst b/docs/connection/db_connection/mysql/write.rst index 2d7c056c9..869ccc7c5 100644 --- a/docs/connection/db_connection/mysql/write.rst +++ b/docs/connection/db_connection/mysql/write.rst @@ -44,11 +44,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MySQL.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: MySQLWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/execute.rst b/docs/connection/db_connection/oracle/execute.rst index f43eb54be..fff504ee9 100644 --- a/docs/connection/db_connection/oracle/execute.rst +++ b/docs/connection/db_connection/oracle/execute.rst @@ -20,7 +20,7 @@ Use ``Oracle.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Oracle config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`Oracle.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -60,7 +60,7 @@ Use ``Oracle.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`Oracle.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -104,15 +104,13 @@ Examples Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: OracleFetchOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. autopydantic_model:: OracleExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/read.rst b/docs/connection/db_connection/oracle/read.rst index 6592cfc7b..9fd12a035 100644 --- a/docs/connection/db_connection/oracle/read.rst +++ b/docs/connection/db_connection/oracle/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ✅︎ ``hint`` (see `official documentation `_) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Oracle.ReadOptions `) Examples -------- @@ -87,9 +87,8 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCReadOptions +.. 
autopydantic_model:: OracleReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst index 5a94daa42..afe46b064 100644 --- a/docs/connection/db_connection/oracle/sql.rst +++ b/docs/connection/db_connection/oracle/sql.rst @@ -71,6 +71,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCSQLOptions +.. autopydantic_model:: OracleSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/write.rst b/docs/connection/db_connection/oracle/write.rst index 5ce0e3b86..2f572e926 100644 --- a/docs/connection/db_connection/oracle/write.rst +++ b/docs/connection/db_connection/oracle/write.rst @@ -40,11 +40,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`OracleWriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: OracleWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 753c4f624..8c6fbc858 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -20,7 +20,7 @@ Use ``Postgres.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Postgres config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`Postgres.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -58,7 +58,7 @@ Use ``Postgres.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`Postgres.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -102,15 +102,13 @@ Examples Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: PostgresFetchOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. 
autopydantic_model:: PostgresExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index 67b5234a2..fa3fe1728 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Postgres) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Postgres.ReadOptions `) Examples -------- @@ -85,9 +85,8 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: PostgresReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index f64cf528e..bfa90e689 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -70,6 +70,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCSQLOptions +.. autopydantic_model:: PostgresSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/write.rst b/docs/connection/db_connection/postgres/write.rst index f35edf9b8..b99f0048e 100644 --- a/docs/connection/db_connection/postgres/write.rst +++ b/docs/connection/db_connection/postgres/write.rst @@ -40,11 +40,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`Postgres.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: PostgresWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 3d48d0b9a..28be9d35b 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -20,7 +20,7 @@ Use ``Teradata.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Teradata config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`FetchOptions `. +Method accepts :obj:`Teradata.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -55,7 +55,7 @@ Use ``Teradata.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. 
-Method accepts :obj:`ExecuteOptions `. +Method accepts :obj:`Teradata.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -101,15 +101,13 @@ Examples Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCFetchOptions +.. autopydantic_model:: TeradataFetchOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false -.. autopydantic_model:: JDBCExecuteOptions +.. autopydantic_model:: TeradataExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/read.rst b/docs/connection/db_connection/teradata/read.rst index f4cf95bfb..e3b8d4618 100644 --- a/docs/connection/db_connection/teradata/read.rst +++ b/docs/connection/db_connection/teradata/read.rst @@ -18,7 +18,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Teradata) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Teradata.ReadOptions `) Examples -------- @@ -115,9 +115,8 @@ Prefer using ``partitioning_mode="hash"`` from example above. Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: TeradataReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/sql.rst b/docs/connection/db_connection/teradata/sql.rst index 98b03e107..df50251b0 100644 --- a/docs/connection/db_connection/teradata/sql.rst +++ b/docs/connection/db_connection/teradata/sql.rst @@ -68,6 +68,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCSQLOptions +.. autopydantic_model:: TeradataSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/write.rst b/docs/connection/db_connection/teradata/write.rst index 288e79667..aec3844f0 100644 --- a/docs/connection/db_connection/teradata/write.rst +++ b/docs/connection/db_connection/teradata/write.rst @@ -110,11 +110,12 @@ Choosing one of the modes can alter connector behavior. For example: Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`Teradata.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCWriteOptions +.. 
autopydantic_model:: TeradataWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 89b7ff463..3892c5f94 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -8,6 +8,13 @@ from onetl._util.classproperty import classproperty from onetl._util.version import Version from onetl.connection.db_connection.clickhouse.dialect import ClickhouseDialect +from onetl.connection.db_connection.clickhouse.options import ( + ClickhouseExecuteOptions, + ClickhouseFetchOptions, + ClickhouseReadOptions, + ClickhouseSQLOptions, + ClickhouseWriteOptions, +) from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_mixin import JDBCStatementType from onetl.hooks import slot, support_hooks @@ -104,6 +111,12 @@ class Clickhouse(JDBCConnection): Extra = ClickhouseExtra Dialect = ClickhouseDialect + ReadOptions = ClickhouseReadOptions + WriteOptions = ClickhouseWriteOptions + SQLOptions = ClickhouseSQLOptions + FetchOptions = ClickhouseFetchOptions + ExecuteOptions = ClickhouseExecuteOptions + DRIVER: ClassVar[str] = "com.clickhouse.jdbc.ClickHouseDriver" @slot diff --git a/onetl/connection/db_connection/clickhouse/options.py b/onetl/connection/db_connection/clickhouse/options.py new file mode 100644 index 000000000..54b4558fb --- /dev/null +++ b/onetl/connection/db_connection/clickhouse/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class ClickhouseReadOptions(JDBCReadOptions): + pass + + +class ClickhouseWriteOptions(JDBCWriteOptions): + pass + + +class ClickhouseSQLOptions(JDBCSQLOptions): + pass + + +class ClickhouseFetchOptions(JDBCFetchOptions): + pass + + +class ClickhouseExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index c9730c44c..77319b3b0 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -26,7 +26,10 @@ ) from onetl.connection.db_connection.greenplum.dialect import GreenplumDialect from onetl.connection.db_connection.greenplum.options import ( + GreenplumExecuteOptions, + GreenplumFetchOptions, GreenplumReadOptions, + GreenplumSQLOptions, GreenplumTableExistBehavior, GreenplumWriteOptions, ) @@ -154,10 +157,14 @@ class Greenplum(JDBCMixin, DBConnection): port: int = 5432 extra: GreenplumExtra = GreenplumExtra() - Extra = GreenplumExtra - Dialect = GreenplumDialect ReadOptions = GreenplumReadOptions WriteOptions = GreenplumWriteOptions + SQLOptions = GreenplumSQLOptions + FetchOptions = GreenplumFetchOptions + ExecuteOptions = GreenplumExecuteOptions + + Extra = GreenplumExtra + Dialect = GreenplumDialect DRIVER: ClassVar[str] = "org.postgresql.Driver" CONNECTIONS_WARNING_LIMIT: ClassVar[int] = 31 diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 
e100e35f1..65d275af3 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -11,7 +11,12 @@ except (ImportError, AttributeError): from pydantic import Field, root_validator # type: ignore[no-redef, assignment] +from onetl.connection.db_connection.jdbc_connection.options import JDBCSQLOptions from onetl.connection.db_connection.jdbc_mixin import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) # options from which are populated by Greenplum class methods GENERIC_PROHIBITED_OPTIONS = frozenset( @@ -311,3 +316,15 @@ def _mode_is_deprecated(cls, values): stacklevel=3, ) return values + + +class GreenplumSQLOptions(JDBCSQLOptions): + pass + + +class GreenplumFetchOptions(JDBCFetchOptions): + pass + + +class GreenplumExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 48143191d..04add7157 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -9,6 +9,13 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.mssql.dialect import MSSQLDialect +from onetl.connection.db_connection.mssql.options import ( + MSSQLExecuteOptions, + MSSQLFetchOptions, + MSSQLReadOptions, + MSSQLSQLOptions, + MSSQLWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl import GenericOptions @@ -139,6 +146,12 @@ class MSSQL(JDBCConnection): port: int = 1433 extra: MSSQLExtra = MSSQLExtra() + ReadOptions = MSSQLReadOptions + WriteOptions = MSSQLWriteOptions + SQLOptions = MSSQLSQLOptions + FetchOptions = MSSQLFetchOptions + ExecuteOptions = MSSQLExecuteOptions + Extra = MSSQLExtra Dialect = MSSQLDialect diff --git a/onetl/connection/db_connection/mssql/options.py b/onetl/connection/db_connection/mssql/options.py new file mode 100644 index 000000000..5e5a9d11a --- /dev/null +++ b/onetl/connection/db_connection/mssql/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class MSSQLReadOptions(JDBCReadOptions): + pass + + +class MSSQLWriteOptions(JDBCWriteOptions): + pass + + +class MSSQLSQLOptions(JDBCSQLOptions): + pass + + +class MSSQLFetchOptions(JDBCFetchOptions): + pass + + +class MSSQLExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index a26f8f385..6774870b9 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -9,6 +9,13 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.mysql.dialect import MySQLDialect +from onetl.connection.db_connection.mysql.options import ( + MySQLExecuteOptions, + MySQLFetchOptions, + MySQLReadOptions, + MySQLSQLOptions, + MySQLWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl.generic_options import GenericOptions @@ -98,6 +105,12 @@ 
class MySQL(JDBCConnection): database: Optional[str] = None extra: MySQLExtra = MySQLExtra() + ReadOptions = MySQLReadOptions + WriteOptions = MySQLWriteOptions + SQLOptions = MySQLSQLOptions + FetchOptions = MySQLFetchOptions + ExecuteOptions = MySQLExecuteOptions + Extra = MySQLExtra Dialect = MySQLDialect diff --git a/onetl/connection/db_connection/mysql/options.py b/onetl/connection/db_connection/mysql/options.py new file mode 100644 index 000000000..b2bb2b9d3 --- /dev/null +++ b/onetl/connection/db_connection/mysql/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class MySQLReadOptions(JDBCReadOptions): + pass + + +class MySQLWriteOptions(JDBCWriteOptions): + pass + + +class MySQLSQLOptions(JDBCSQLOptions): + pass + + +class MySQLFetchOptions(JDBCFetchOptions): + pass + + +class MySQLExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index c21618381..28494bdde 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -24,9 +24,17 @@ from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCExecuteOptions, + JDBCFetchOptions, JDBCOptions, ) from onetl.connection.db_connection.oracle.dialect import OracleDialect +from onetl.connection.db_connection.oracle.options import ( + OracleExecuteOptions, + OracleFetchOptions, + OracleReadOptions, + OracleSQLOptions, + OracleWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.hwm import Window from onetl.impl import GenericOptions @@ -172,6 +180,12 @@ class Oracle(JDBCConnection): service_name: Optional[str] = None extra: OracleExtra = OracleExtra() + ReadOptions = OracleReadOptions + WriteOptions = OracleWriteOptions + SQLOptions = OracleSQLOptions + FetchOptions = OracleFetchOptions + ExecuteOptions = OracleExecuteOptions + Extra = OracleExtra Dialect = OracleDialect @@ -339,7 +353,7 @@ def _get_compile_errors( type_name: str, schema: str, object_name: str, - options: JDBCOptions | JDBCExecuteOptions, + options: JDBCExecuteOptions | JDBCFetchOptions, ) -> list[tuple[ErrorPosition, str]]: """ Get compile errors for the object. 
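For illustration only, not part of the patch: a minimal sketch of how the connector-specific option classes introduced above are meant to be used. Option names and values mirror the unit tests further below; everything else is a placeholder.

.. code:: python

    from onetl.connection import Oracle

    # OracleReadOptions / OracleWriteOptions are plain subclasses of the generic
    # JDBC options, so existing option names keep working unchanged
    read_options = Oracle.ReadOptions(fetchsize=10_000)
    write_options = Oracle.WriteOptions(if_exists="replace_entire_table")

    # validation errors now mention the connector-specific class name, e.g.
    # Oracle.ReadOptions(createTableColumnTypes="a varchar") raises ValueError with
    # "Options ['createTableColumnTypes'] are not allowed to use in a OracleReadOptions"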
diff --git a/onetl/connection/db_connection/oracle/options.py b/onetl/connection/db_connection/oracle/options.py new file mode 100644 index 000000000..2e0b4f9c0 --- /dev/null +++ b/onetl/connection/db_connection/oracle/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class OracleReadOptions(JDBCReadOptions): + pass + + +class OracleWriteOptions(JDBCWriteOptions): + pass + + +class OracleSQLOptions(JDBCSQLOptions): + pass + + +class OracleFetchOptions(JDBCFetchOptions): + pass + + +class OracleExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 2b67d43ec..e78d175a8 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -14,6 +14,13 @@ JDBCOptions, ) from onetl.connection.db_connection.postgres.dialect import PostgresDialect +from onetl.connection.db_connection.postgres.options import ( + PostgresExecuteOptions, + PostgresFetchOptions, + PostgresReadOptions, + PostgresSQLOptions, + PostgresWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl import GenericOptions @@ -107,6 +114,12 @@ class Postgres(JDBCConnection): port: int = 5432 extra: PostgresExtra = PostgresExtra() + ReadOptions = PostgresReadOptions + WriteOptions = PostgresWriteOptions + SQLOptions = PostgresSQLOptions + FetchOptions = PostgresFetchOptions + ExecuteOptions = PostgresExecuteOptions + Extra = PostgresExtra Dialect = PostgresDialect diff --git a/onetl/connection/db_connection/postgres/options.py b/onetl/connection/db_connection/postgres/options.py new file mode 100644 index 000000000..4f7aecd1d --- /dev/null +++ b/onetl/connection/db_connection/postgres/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class PostgresReadOptions(JDBCReadOptions): + pass + + +class PostgresWriteOptions(JDBCWriteOptions): + pass + + +class PostgresSQLOptions(JDBCSQLOptions): + pass + + +class PostgresFetchOptions(JDBCFetchOptions): + pass + + +class PostgresExecuteOptions(JDBCExecuteOptions): + pass diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index cf135009d..3f7f884e3 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -10,6 +10,13 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.teradata.dialect import TeradataDialect +from onetl.connection.db_connection.teradata.options import ( + TeradataExecuteOptions, + TeradataFetchOptions, + TeradataReadOptions, + TeradataSQLOptions, + TeradataWriteOptions, +) from onetl.hooks import slot from onetl.impl import GenericOptions @@ -118,6 +125,12 @@ class Teradata(JDBCConnection): 
database: Optional[str] = None extra: TeradataExtra = TeradataExtra() + ReadOptions = TeradataReadOptions + WriteOptions = TeradataWriteOptions + SQLOptions = TeradataSQLOptions + FetchOptions = TeradataFetchOptions + ExecuteOptions = TeradataExecuteOptions + Extra = TeradataExtra Dialect = TeradataDialect diff --git a/onetl/connection/db_connection/teradata/options.py b/onetl/connection/db_connection/teradata/options.py new file mode 100644 index 000000000..c71592c47 --- /dev/null +++ b/onetl/connection/db_connection/teradata/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class TeradataReadOptions(JDBCReadOptions): + pass + + +class TeradataWriteOptions(JDBCWriteOptions): + pass + + +class TeradataSQLOptions(JDBCSQLOptions): + pass + + +class TeradataFetchOptions(JDBCFetchOptions): + pass + + +class TeradataExecuteOptions(JDBCExecuteOptions): + pass diff --git a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py index 597dbf0c0..a280d53c7 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py @@ -3,7 +3,16 @@ import pytest -from onetl.connection import Greenplum, Hive, Postgres +from onetl.connection import ( + MSSQL, + Clickhouse, + Greenplum, + Hive, + MySQL, + Oracle, + Postgres, + Teradata, +) pytestmark = [pytest.mark.postgres] @@ -18,6 +27,26 @@ Postgres.Options, Greenplum.ReadOptions, Greenplum.WriteOptions, + Clickhouse.ReadOptions, + Clickhouse.WriteOptions, + Clickhouse.FetchOptions, + Clickhouse.ExecuteOptions, + MSSQL.ReadOptions, + MSSQL.WriteOptions, + MSSQL.FetchOptions, + MSSQL.ExecuteOptions, + MySQL.ReadOptions, + MySQL.WriteOptions, + MySQL.FetchOptions, + MySQL.ExecuteOptions, + Teradata.ReadOptions, + Teradata.WriteOptions, + Teradata.FetchOptions, + Teradata.ExecuteOptions, + Oracle.ReadOptions, + Oracle.WriteOptions, + Oracle.FetchOptions, + Oracle.ExecuteOptions, ], ) @pytest.mark.parametrize( @@ -39,11 +68,21 @@ def test_db_options_connection_parameters_cannot_be_passed(options_class, arg, v [ (Hive.WriteOptions, "HiveWriteOptions", {"if_exists": "replace_overlapping_partitions"}), (Hive.Options, "HiveLegacyOptions", {"if_exists": "replace_overlapping_partitions"}), - (Postgres.ReadOptions, "JDBCReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), - (Postgres.WriteOptions, "JDBCWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Postgres.ReadOptions, "PostgresReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Postgres.WriteOptions, "PostgresWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), (Postgres.Options, "JDBCLegacyOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), (Greenplum.ReadOptions, "GreenplumReadOptions", {"partitions": 10}), (Greenplum.WriteOptions, "GreenplumWriteOptions", {"if_exists": "replace_entire_table"}), + (Clickhouse.ReadOptions, "ClickhouseReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Clickhouse.WriteOptions, "ClickhouseWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (MSSQL.ReadOptions, "MSSQLReadOptions", {"fetchsize": 10, "keytab": 
"a/b/c"}), + (MSSQL.WriteOptions, "MSSQLWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (MySQL.ReadOptions, "MySQLReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (MySQL.WriteOptions, "MySQLWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Teradata.ReadOptions, "TeradataReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Teradata.WriteOptions, "TeradataWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Oracle.ReadOptions, "OracleReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Oracle.WriteOptions, "OracleWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), ], ) def test_db_options_warn_for_unknown(options_class, options_class_name, known_options, caplog): @@ -65,18 +104,20 @@ def test_db_options_warn_for_unknown(options_class, options_class_name, known_op @pytest.mark.parametrize( "options_class,options", [ - ( - Postgres.ReadOptions, - Postgres.WriteOptions(), - ), - ( - Postgres.WriteOptions, - Postgres.ReadOptions(), - ), - ], - ids=[ - "Write options object passed to ReadOptions", - "Read options object passed to WriteOptions", + (Postgres.ReadOptions, Postgres.WriteOptions()), + (Postgres.WriteOptions, Postgres.ReadOptions()), + (Clickhouse.ReadOptions, Clickhouse.WriteOptions()), + (Clickhouse.WriteOptions, Clickhouse.ReadOptions()), + (MSSQL.ReadOptions, MSSQL.WriteOptions()), + (MSSQL.WriteOptions, MSSQL.ReadOptions()), + (MySQL.ReadOptions, MySQL.WriteOptions()), + (MySQL.WriteOptions, MySQL.ReadOptions()), + (Teradata.ReadOptions, Teradata.WriteOptions()), + (Teradata.WriteOptions, Teradata.ReadOptions()), + (Greenplum.ReadOptions, Greenplum.WriteOptions()), + (Greenplum.WriteOptions, Greenplum.ReadOptions()), + (Oracle.ReadOptions, Oracle.WriteOptions()), + (Oracle.WriteOptions, Oracle.ReadOptions()), ], ) def test_db_options_parse_mismatch_class(options_class, options): @@ -106,15 +147,36 @@ def test_db_options_parse_mismatch_connection_and_options_types(connection, opti @pytest.mark.parametrize( "options_class", [ + # PostgreSQL options Postgres.ReadOptions, Postgres.WriteOptions, Postgres.FetchOptions, Postgres.ExecuteOptions, + Postgres.Options, Greenplum.ReadOptions, Greenplum.WriteOptions, Hive.WriteOptions, - Postgres.Options, Hive.Options, + Clickhouse.ReadOptions, + Clickhouse.WriteOptions, + Clickhouse.FetchOptions, + Clickhouse.ExecuteOptions, + MSSQL.ReadOptions, + MSSQL.WriteOptions, + MSSQL.FetchOptions, + MSSQL.ExecuteOptions, + MySQL.ReadOptions, + MySQL.WriteOptions, + MySQL.FetchOptions, + MySQL.ExecuteOptions, + Teradata.ReadOptions, + Teradata.WriteOptions, + Teradata.FetchOptions, + Teradata.ExecuteOptions, + Oracle.ReadOptions, + Oracle.WriteOptions, + Oracle.FetchOptions, + Oracle.ExecuteOptions, ], ) @pytest.mark.parametrize( diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 55c3c942e..f3b996140 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -233,8 +233,8 @@ def test_greenplum_write_options_default(): [ (Greenplum.ReadOptions, "GreenplumReadOptions"), (Greenplum.WriteOptions, "GreenplumWriteOptions"), - (Greenplum.FetchOptions, "JDBCFetchOptions"), - (Greenplum.ExecuteOptions, "JDBCExecuteOptions"), + (Greenplum.FetchOptions, "GreenplumFetchOptions"), + (Greenplum.ExecuteOptions, "GreenplumExecuteOptions"), (Greenplum.Extra, 
"GreenplumExtra"), ], ) diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index 90c0d1903..699838889 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -3,7 +3,7 @@ import pytest from onetl._internal import to_camel -from onetl.connection import Postgres +from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, Teradata from onetl.connection.db_connection.jdbc_connection import JDBCTableExistBehavior pytestmark = [pytest.mark.postgres] @@ -44,21 +44,57 @@ def test_jdbc_options_default(): ("properties", {"abc": "cde"}), ], ) -@pytest.mark.parametrize("options_class", [Postgres.FetchOptions, Postgres.ExecuteOptions]) -def test_jdbc_read_write_options_populated_by_connection_class(arg, value, options_class): - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCReadOptions" - with pytest.raises(ValueError, match=error_msg): - Postgres.ReadOptions.parse({arg: value}) - - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCWriteOptions" - with pytest.raises(ValueError, match=error_msg): - Postgres.WriteOptions.parse({arg: value}) - - # FetchOptions & ExecuteOptions does not have such restriction - options = options_class.parse({arg: value}) - assert options.dict()[arg] == value +@pytest.mark.parametrize( + "options_class, read_write_restriction", + [ + (Postgres.FetchOptions, False), + (Postgres.ExecuteOptions, False), + (Postgres.ReadOptions, True), + (Postgres.WriteOptions, True), + (Clickhouse.FetchOptions, False), + (Clickhouse.ExecuteOptions, False), + (Clickhouse.ReadOptions, True), + (Clickhouse.WriteOptions, True), + (MSSQL.FetchOptions, False), + (MSSQL.ExecuteOptions, False), + (MSSQL.ReadOptions, True), + (MSSQL.WriteOptions, True), + (MySQL.FetchOptions, False), + (MySQL.ExecuteOptions, False), + (MySQL.ReadOptions, True), + (MySQL.WriteOptions, True), + (Teradata.FetchOptions, False), + (Teradata.ExecuteOptions, False), + (Teradata.ReadOptions, True), + (Teradata.WriteOptions, True), + (Oracle.FetchOptions, False), + (Oracle.ExecuteOptions, False), + (Oracle.ReadOptions, True), + (Oracle.WriteOptions, True), + ], +) +def test_jdbc_read_write_options_populated_by_connection_class(arg, value, options_class, read_write_restriction): + if read_write_restriction: + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class.__name__}" + with pytest.raises(ValueError, match=error_msg): + options_class.parse({arg: value}) + else: + # FetchOptions & ExecuteOptions does not have such restriction + options = options_class.parse({arg: value}) + assert options.dict()[arg] == value +@pytest.mark.parametrize( + "options_class, options_class_name", + [ + (Postgres.ReadOptions, "PostgresReadOptions"), + (Clickhouse.ReadOptions, "ClickhouseReadOptions"), + (MSSQL.ReadOptions, "MSSQLReadOptions"), + (MySQL.ReadOptions, "MySQLReadOptions"), + (Teradata.ReadOptions, "TeradataReadOptions"), + (Oracle.ReadOptions, "OracleReadOptions"), + ], +) @pytest.mark.parametrize( "arg, value", [ @@ -73,12 +109,23 @@ def test_jdbc_read_write_options_populated_by_connection_class(arg, value, optio ("createTableColumnTypes", "a varchar"), ], ) -def test_jdbc_write_options_cannot_be_used_in_read_options(arg, value): - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCReadOptions" +def 
test_jdbc_write_options_cannot_be_used_in_read_options(arg, value, options_class, options_class_name): + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class_name}" with pytest.raises(ValueError, match=error_msg): - Postgres.ReadOptions.parse({arg: value}) + options_class.parse({arg: value}) +@pytest.mark.parametrize( + "options_class, options_class_name", + [ + (Postgres.WriteOptions, "PostgresWriteOptions"), + (Clickhouse.WriteOptions, "ClickhouseWriteOptions"), + (MSSQL.WriteOptions, "MSSQLWriteOptions"), + (MySQL.WriteOptions, "MySQLWriteOptions"), + (Teradata.WriteOptions, "TeradataWriteOptions"), + (Oracle.WriteOptions, "OracleWriteOptions"), + ], +) @pytest.mark.parametrize( "arg, value", [ @@ -101,10 +148,10 @@ def test_jdbc_write_options_cannot_be_used_in_read_options(arg, value): ("predicates", "s"), ], ) -def test_jdbc_read_options_cannot_be_used_in_write_options(arg, value): - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCWriteOptions" +def test_jdbc_read_options_cannot_be_used_in_write_options(options_class, options_class_name, arg, value): + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class_name}" with pytest.raises(ValueError, match=error_msg): - Postgres.WriteOptions.parse({arg: value}) + options_class.parse({arg: value}) @pytest.mark.parametrize( @@ -137,12 +184,23 @@ def test_jdbc_old_options_allowed_but_deprecated(arg, value): assert options.dict(by_alias=True)[to_camel(arg)] == value -def test_jdbc_read_options_partitioning_is_not_valid(): +@pytest.mark.parametrize( + "options_class", + [ + Postgres.ReadOptions, + Clickhouse.ReadOptions, + MSSQL.ReadOptions, + MySQL.ReadOptions, + Teradata.ReadOptions, + Oracle.ReadOptions, + ], +) +def test_jdbc_read_options_partitioning_is_not_valid(options_class): with pytest.raises(ValueError): - Postgres.ReadOptions(numPartitions=200) + options_class(numPartitions=200) with pytest.raises(ValueError): - Postgres.ReadOptions(partitionColumn="test") + options_class(partitionColumn="test") def test_jdbc_read_options_case(): @@ -254,14 +312,19 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( - "options", + "options_class, options", [ - {"mode": "wrong_mode"}, + (Postgres.WriteOptions, {"mode": "wrong_mode"}), + (Clickhouse.WriteOptions, {"mode": "wrong_mode"}), + (MSSQL.WriteOptions, {"mode": "wrong_mode"}), + (MySQL.WriteOptions, {"mode": "wrong_mode"}), + (Teradata.WriteOptions, {"mode": "wrong_mode"}), + (Oracle.WriteOptions, {"mode": "wrong_mode"}), ], ) -def test_jdbc_write_options_mode_wrong(options): +def test_jdbc_write_options_mode_wrong(options_class, options): with pytest.raises(ValueError, match="value is not a valid enumeration member"): - Postgres.WriteOptions(**options) + options_class(**options) @pytest.mark.parametrize( @@ -277,13 +340,35 @@ def test_jdbc_sql_options_partition_bounds(options, expected_message): Postgres.SQLOptions(**options) -def test_jdbc_sql_options_partitioning_mode_prohibited(): +@pytest.mark.parametrize( + "options_class", + [ + Postgres.SQLOptions, + Clickhouse.SQLOptions, + MSSQL.SQLOptions, + MySQL.SQLOptions, + Teradata.SQLOptions, + Oracle.SQLOptions, + ], +) +def test_jdbc_sql_options_partitioning_mode_prohibited(options_class): with pytest.raises(ValueError, match=r"Options \['partitioning_mode'\] are not allowed"): - Postgres.SQLOptions(partitioning_mode="range") + options_class(partitioning_mode="range") -def test_jdbc_sql_options_default(): - options = 
Postgres.SQLOptions() +@pytest.mark.parametrize( + "options_class", + [ + Postgres.SQLOptions, + Clickhouse.SQLOptions, + MSSQL.SQLOptions, + MySQL.SQLOptions, + Teradata.SQLOptions, + Oracle.SQLOptions, + ], +) +def test_jdbc_sql_options_default(options_class): + options = options_class() assert options.fetchsize == 100_000 assert options.query_timeout is None From 2a7e013d34a8ba42f387616a5e28c777476306e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 09:34:28 +0000 Subject: [PATCH 57/71] [DOP-13900] Add note about connecting to Clickhouse cluster --- .../next_release/280.improvement.rst | 1 + .../clickhouse/prerequisites.rst | 26 +++++-- .../db_connection/clickhouse/types.rst | 68 ++++++++++++------- .../db_connection/kafka/prerequisites.rst | 20 +++--- .../db_connection/postgres/types.rst | 2 +- docs/file/file_downloader/result.rst | 2 +- docs/file/file_mover/result.rst | 2 +- docs/file/file_uploader/result.rst | 2 +- 8 files changed, 79 insertions(+), 44 deletions(-) create mode 100644 docs/changelog/next_release/280.improvement.rst diff --git a/docs/changelog/next_release/280.improvement.rst b/docs/changelog/next_release/280.improvement.rst new file mode 100644 index 000000000..55432ef45 --- /dev/null +++ b/docs/changelog/next_release/280.improvement.rst @@ -0,0 +1 @@ +Add note about connecting to Clickhouse cluster. diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index f7ade0341..03384b1a0 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -21,7 +21,7 @@ BEFORE creating the connector instance. See :ref:`install-spark` installation instruction for more details. Connecting to Clickhouse ------------------------ +------------------------ Connection port ~~~~~~~~~~~~~~~ @@ -30,11 +30,27 @@ Connector can only use **HTTP** (usually ``8123`` port) or **HTTPS** (usually `` TCP and GRPC protocols are NOT supported. -Clickhouse cluster interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Connecting to cluster +~~~~~~~~~~~~~~~~~~~~~ -If you're using Clickhouse cluster, it is currently possible to connect only to one specific cluster node. -Connecting to multiple nodes simultaneously is not supported. +It is possible to connect to Clickhouse cluster, and use it's load balancing capabilities to read or write data in parallel. +Each Spark executor can connect to random Clickhouse nodes, instead of sending all the data to a node specified in connection params. + +This requires all Clickhouse servers to run on different hosts, and **listen the same HTTP port**. +Set ``auto_discovery=True`` to enable this feature (disabled by default): + +.. code:: python + + Clickhouse( + host="node1.of.cluster", + port=8123, + extra={ + "auto_discovery": True, + "load_balancing_policy": "roundRobin", + }, + ) + +See `official documentation `_. 
Required grants ~~~~~~~~~~~~~~~ diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 9c579c08d..ca024d36a 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -125,9 +125,9 @@ Numeric types ~~~~~~~~~~~~~ +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +================================+===================================+===============================+===============================+ -| ``Bool`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +| ``Bool`` | ``BooleanType()`` | ``Bool`` | ``UInt64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -158,8 +158,8 @@ Numeric types | ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int128`` | unsupported [3]_ | | | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Int256`` | unsupported [3]_ | | | ++--------------------------------+ | | | +| ``Int256`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``-`` | ``ByteType()`` | ``Int8`` | ``Int8`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -198,22 +198,27 @@ Notes: +===================================+======================================+==================================+===============================+ | ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``Date32`` | ``DateType()`` | ``Date`` | ``Date`` | -| | | | **cannot be inserted** [6]_ | +| ``Date32`` | ``DateType()`` | ``Date`` | ``Date``, | +| | | | **cannot insert data** [4]_ | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | ``DateTime32`` | -+-----------------------------------+--------------------------------------+----------------------------------+ seconds | -| ``DateTime64(3)``, milliseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | **precision loss** [4]_ | -+-----------------------------------+--------------------------------------+----------------------------------+ | -| ``DateTime64(6)``, microseconds | ``TimestampType()`` | ``DateTime64(6)``, microseconds | | -+-----------------------------------+--------------------------------------+----------------------------------+ | -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()`` | 
``DateTime64(6)`` | | -| | | microseconds | | -| | | **precision loss** [4]_ | | +| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``-`` | ``TimestampNTZType()`` | ``DateTime64(6)`` | | +| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| | | | **precision loss** [5]_ | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalNanosecond`` | ``LongType()`` | ```Int64`` | ``Int64`` | +| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+ | **precision loss** [7]_ | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | | | +| | **precision loss** [6]_ | | | +| | | | | ++-----------------------------------+--------------------------------------+ | | +| ``-`` | ``TimestampNTZType()``, microseconds | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime32(TZ)`` | unsupported [7]_ | | | ++-----------------------------------+ | | | +| ``DateTime64(P, TZ)`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | ``LongType()`` | ``Int64`` | ``Int64`` | +-----------------------------------+ | | | | ``IntervalMicrosecond`` | | | | +-----------------------------------+ | | | @@ -262,17 +267,27 @@ Notes: * `Spark TimestampType documentation `_ .. [4] - Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), - but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). - Nanoseconds will be lost during read or write operations. + ``Date32`` has different bytes representation than ``Date``, and inserting value of type ``Date32`` to ``Date`` column + leads to errors on Clickhouse side, e.g. ``Date(106617) should be between 0 and 65535 inclusive of both values``. + Although Spark does properly read the ``Date32`` column as ``DateType()``, and there should be no difference at all. + Probably this is some bug in Clickhouse driver. .. [5] Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). Inserting data with milliseconds precision (``23:59:59.999``) will lead to **throwing away milliseconds**. + Solution: create table manually, with proper column type. .. [6] - Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. - So you can create Clickhouse table with Spark, but cannot write data to column of this type. + Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + Solution: create table manually, with proper column type. + +.. 
[7] + Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss`` + of ``DateTime32`` column type (see [5]_). + So Spark can create Clickhouse table, but cannot write data to column of this type. + Solution: create table manually, with proper column type. String types ~~~~~~~~~~~~~ @@ -291,6 +306,8 @@ String types | ``IPv4`` | | | | +--------------------------------------+ | | | | ``IPv6`` | | | | ++--------------------------------------+ | | | +| ``UUID`` | | | | +--------------------------------------+------------------+ | | | ``-`` | ``BinaryType()`` | | | +--------------------------------------+------------------+------------------------+--------------------------+ @@ -311,7 +328,6 @@ Columns of these Clickhouse types cannot be read by Spark: * ``Ring`` * ``SimpleAggregateFunction(func, T)`` * ``Tuple(T1, T2, ...)`` - * ``UUID`` Dataframe with these Spark types be written to Clickhouse: * ``ArrayType(T)`` @@ -359,9 +375,10 @@ For parsing JSON columns in ClickHouse, :obj:`JSON.parse_column ` (not secure) - * :obj:`SSL ` (secure, recommended) + * :obj:`PLAINTEXT ` (not secure) + * :obj:`SSL ` (secure, recommended) Note that specific port can listen for only one of these protocols, so it is important to set proper port number + protocol combination. @@ -47,16 +47,16 @@ Authentication mechanism Kafka can support different authentication mechanism (also known as `SASL `_). List of currently supported mechanisms: - * :obj:`PLAIN `. To no confuse this with ``PLAINTEXT`` connection protocol, onETL uses name ``BasicAuth``. - * :obj:`GSSAPI `. To simplify naming, onETL uses name ``KerberosAuth``. - * :obj:`SCRAM-SHA-256 or SCRAM-SHA-512 ` (recommended). + * :obj:`PLAIN `. To no confuse this with ``PLAINTEXT`` connection protocol, onETL uses name ``BasicAuth``. + * :obj:`GSSAPI `. To simplify naming, onETL uses name ``KerberosAuth``. + * :obj:`SCRAM-SHA-256 or SCRAM-SHA-512 ` (recommended). Different mechanisms use different types of credentials (login + password, keytab file, and so on). Note that connection protocol and auth mechanism are set in pairs: - * If you see ``SASL_PLAINTEXT`` this means ``PLAINTEXT`` connection protocol + some auth mechanism. - * If you see ``SASL_SSL`` this means ``SSL`` connection protocol + some auth mechanism. - * If you see just ``PLAINTEXT`` or ``SSL`` (**no** ``SASL``), this means that authentication is disabled (anonymous access). + * If you see ``SASL_PLAINTEXT`` this means ``PLAINTEXT`` connection protocol + some auth mechanism. + * If you see ``SASL_SSL`` this means ``SSL`` connection protocol + some auth mechanism. + * If you see just ``PLAINTEXT`` or ``SSL`` (**no** ``SASL``), this means that authentication is disabled (anonymous access). Please contact your Kafka administrator to get details about enabled auth mechanism in a specific Kafka instance. @@ -64,7 +64,7 @@ Required grants ~~~~~~~~~~~~~~~ Ask your Kafka administrator to set following grants for a user, *if Kafka instance uses ACL*: - * ``Describe`` + ``Read`` for reading data from Kafka (Consumer). - * ``Describe`` + ``Write`` for writing data from Kafka (Producer). + * ``Describe`` + ``Read`` for reading data from Kafka (Consumer). + * ``Describe`` + ``Write`` for writing data from Kafka (Producer). More details can be found in `documentation `_. 
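For illustration only, not part of the patch: a sketch of pairing a connection protocol with an auth mechanism as described above (``SASL_PLAINTEXT`` = ``PLAINTEXT`` protocol + ``PLAIN``/``BasicAuth`` mechanism). Class names follow the objects referenced in the text; constructor parameters, addresses and the ``spark`` session are assumptions, check the Kafka connection reference for exact signatures.

.. code:: python

    from onetl.connection import Kafka

    # assumption: PlaintextProtocol + BasicAuth gives SASL_PLAINTEXT;
    # ``spark`` is an already created SparkSession
    kafka = Kafka(
        addresses=["kafka1.domain.com:9092"],
        cluster="my-cluster",
        protocol=Kafka.PlaintextProtocol(),
        auth=Kafka.BasicAuth(user="onetl", password="..."),
        spark=spark,
    )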
diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index abca54f34..94351f234 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -27,7 +27,7 @@ This is how Postgres connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. -* Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. +* Take only columns present in DataFrame (by name, case insensitive) [2]_. For each found column get Postgres type. * Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Postgres type (write)`` match ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is. * If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be casted to target column type **on Postgres side**. diff --git a/docs/file/file_downloader/result.rst b/docs/file/file_downloader/result.rst index 8fd20e9df..dec2b2dd4 100644 --- a/docs/file/file_downloader/result.rst +++ b/docs/file/file_downloader/result.rst @@ -6,4 +6,4 @@ File Downloader Result .. currentmodule:: onetl.file.file_downloader.result .. autoclass:: DownloadResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json diff --git a/docs/file/file_mover/result.rst b/docs/file/file_mover/result.rst index d4ea950f3..c77340bc7 100644 --- a/docs/file/file_mover/result.rst +++ b/docs/file/file_mover/result.rst @@ -6,4 +6,4 @@ File Mover Result .. currentmodule:: onetl.file.file_mover.result .. autoclass:: MoveResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json diff --git a/docs/file/file_uploader/result.rst b/docs/file/file_uploader/result.rst index af20ace14..f83acf9a5 100644 --- a/docs/file/file_uploader/result.rst +++ b/docs/file/file_uploader/result.rst @@ -6,4 +6,4 @@ File Uploader Result .. currentmodule:: onetl.file.file_uploader.result .. 
autoclass:: UploadResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json From 827c1deb828776dd5f6f87d6c6ba52b2d87fa059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 22 May 2024 15:54:25 +0000 Subject: [PATCH 58/71] [DOP-14042] Improve SparkHDFS and SparkS3 documentation --- .../next_release/279.improvement.rst | 1 + .../file_df_connection/spark_hdfs/index.rst | 3 +- .../spark_hdfs/prerequisites.rst | 48 +++++++++++++ .../file_df_connection/spark_s3/index.rst | 3 +- .../spark_s3/prerequisites.rst | 68 +++++++++++++++++++ .../spark_s3/troubleshooting.rst | 14 ++-- .../spark_hdfs/connection.py | 34 +++------- .../file_df_connection/spark_local_fs.py | 11 +-- .../file_df_connection/spark_s3/connection.py | 24 +------ 9 files changed, 135 insertions(+), 71 deletions(-) create mode 100644 docs/changelog/next_release/279.improvement.rst create mode 100644 docs/connection/file_df_connection/spark_hdfs/prerequisites.rst create mode 100644 docs/connection/file_df_connection/spark_s3/prerequisites.rst diff --git a/docs/changelog/next_release/279.improvement.rst b/docs/changelog/next_release/279.improvement.rst new file mode 100644 index 000000000..02653343e --- /dev/null +++ b/docs/changelog/next_release/279.improvement.rst @@ -0,0 +1 @@ +Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. diff --git a/docs/connection/file_df_connection/spark_hdfs/index.rst b/docs/connection/file_df_connection/spark_hdfs/index.rst index f7769c0d9..e4f79d6a2 100644 --- a/docs/connection/file_df_connection/spark_hdfs/index.rst +++ b/docs/connection/file_df_connection/spark_hdfs/index.rst @@ -1,12 +1,13 @@ .. _spark-hdfs: Spark HDFS -========================== +========== .. toctree:: :maxdepth: 1 :caption: Connection + prerequisites Connection .. toctree:: diff --git a/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst b/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst new file mode 100644 index 000000000..8cac36de1 --- /dev/null +++ b/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst @@ -0,0 +1,48 @@ +.. _spark-hdfs-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Hadoop versions: 2.x, 3.x (only with Hadoop 3.x libraries) +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +Installing PySpark +------------------ + +To use SparkHDFS connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Using Kerberos +-------------- + +Some of Hadoop managed clusters use Kerberos authentication. 
In this case, you should call the `kinit `_ command +**BEFORE** starting the Spark session to generate a Kerberos ticket. See :ref:`install-kerberos`. + +Sometimes it is also required to pass a keytab file to the Spark config, allowing Spark executors to generate their own Kerberos tickets: + +.. tabs:: + + .. code-tab:: python Spark 3 + + SparkSession.builder + .config("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.kerberos.principal", "user") + .config("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + .. code-tab:: python Spark 2 + + SparkSession.builder + .config("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.yarn.principal", "user") + .config("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() + +See `Spark security documentation `_ +for more details. diff --git a/docs/connection/file_df_connection/spark_s3/index.rst b/docs/connection/file_df_connection/spark_s3/index.rst index f7a32250d..d086acd8a 100644 --- a/docs/connection/file_df_connection/spark_s3/index.rst +++ b/docs/connection/file_df_connection/spark_s3/index.rst @@ -7,5 +7,6 @@ Spark S3 :maxdepth: 1 :caption: Connection + prerequisites Connection - Troubleshooting Guide + Troubleshooting diff --git a/docs/connection/file_df_connection/spark_s3/prerequisites.rst b/docs/connection/file_df_connection/spark_s3/prerequisites.rst new file mode 100644 index 000000000..0d864e664 --- /dev/null +++ b/docs/connection/file_df_connection/spark_s3/prerequisites.rst @@ -0,0 +1,68 @@ +.. _spark-s3-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) +* Java versions: 8 - 20 + +Installing PySpark +------------------ + +To use SparkS3 connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to S3 +---------------- + +Bucket access style +~~~~~~~~~~~~~~~~~~~ + +AWS and some other S3 cloud providers allow bucket access using domain style only, e.g. ``https://mybucket.s3provider.com``. + +Other implementations, like Minio, by default allow path-style access only, e.g. ``https://s3provider.com/mybucket`` +(see `MINIO_DOMAIN `_). + +You should set ``path.style.access`` to ``True`` or ``False`` to choose the preferred style. + +Authentication +~~~~~~~~~~~~~~ + +Different S3 instances can use different authentication methods, like: + * ``access_key + secret_key`` (or username + password) + * ``access_key + secret_key + session_token`` + +Usually these are just passed to the SparkS3 constructor: + +.. code:: python + + SparkS3( + access_key=..., + secret_key=..., + session_token=..., + ) + +But some S3 cloud providers, like AWS, may require custom credential providers. You can pass them like this: + +.. code:: python + + SparkS3( + extra={ + # provider class + "aws.credentials.provider": "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider", + # other options, if needed + "assumed.role.arn": "arn:aws:iam::90066806600238:role/s3-restricted", + }, + ) + +See `Hadoop-AWS `_ documentation. + +Troubleshooting +--------------- + +See :ref:`spark-s3-troubleshooting`.
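As a rough end-to-end sketch of the points above (host, bucket and credentials are placeholders, and ``path.style.access`` is shown for a Minio-like path-style setup; exact arguments may differ in your environment):

.. code:: python

    from onetl.connection import SparkS3

    spark_s3 = SparkS3(
        host="s3provider.com",   # placeholder endpoint
        protocol="https",
        bucket="my-bucket",
        access_key="...",
        secret_key="...",
        extra={
            # path-style access for Minio-like implementations
            "path.style.access": True,
        },
        spark=spark,  # existing SparkSession with Hadoop 3.x / hadoop-aws libraries attached
    ).check()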
diff --git a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst index e3474c20e..a08669af9 100644 --- a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst +++ b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst @@ -215,13 +215,13 @@ If you change port number, this does not lead to changing protocol: .. code:: python - spark_s3 = SparkS3(host="s3.domain.com", port=8080, ...) + spark_s3 = SparkS3(host="s3provider.com", port=8080, ...) You should pass protocol explicitly: .. code:: python - spark_s3 = SparkS3(host="s3.domain.com", port=8080, protocol="http", ...) + spark_s3 = SparkS3(host="s3provider.com", port=8080, protocol="http", ...) SSL certificate is self-signed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -251,20 +251,14 @@ Accessing S3 without domain-style access style support .. code:: txt - Caused by: java.net.UnknownHostException: my-bucket.s3.domain.com - -By default, Hadoop AWS uses domain-style access ``my-bucket.domain.com`` instead of path-style access ``domain.com/my-bucket``, -because this is default option for AWS S3. - -But some S3 implementations does not support domain-style access, e.g. MinIO by default allows only path-style access -(see `MINIO_DOMAIN `_). + Caused by: java.net.UnknownHostException: my-bucket.s3provider.com To use path-style access, use option below: .. code:: python spark_s3 = SparkS3( - host="s3.domain.com", + host="s3provider.com", bucket="my-bucket", ..., extra={ diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 1c0ca6bb3..677ffe3bd 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -39,29 +39,7 @@ class SparkHDFS(SparkFileDFConnection): .. warning:: - To use Hive connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. - - .. note:: - - Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. - - In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options - to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. - See `Spark security documentation `_ - for more details. + Before using this connector please take into account :ref:`spark-hdfs-prerequisites` .. note:: @@ -105,7 +83,7 @@ class SparkHDFS(SparkFileDFConnection): Examples -------- - SparkHDFS connection initialization + SparkHDFS connection initialization: .. code:: python @@ -122,11 +100,15 @@ class SparkHDFS(SparkFileDFConnection): spark=spark, ).check() - SparkHDFS connection initialization with Kerberos support + SparkHDFS connection initialization with Kerberos support: + + .. code:: bash + + $ kinit -kt /path/to/keytab user .. code:: python - from onetl.connection import Hive + from onetl.connection import SparkHDFS from pyspark.sql import SparkSession # Create Spark session. 
diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index 0ebd43683..f65ceaabd 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -28,18 +28,9 @@ class SparkLocalFS(SparkFileDFConnection): .. warning:: - To use SparkLocalFS connector you should have PySpark installed (or injected to ``sys.path``) + To use SparkLocalFS connector you should have PySpark installed (or injected to ``sys.path``) BEFORE creating the connector instance. - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - See :ref:`install-spark` installation instruction for more details. .. warning:: diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 12babf694..eaec3574f 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -53,31 +53,9 @@ class SparkS3(SparkFileDFConnection): Based on `Hadoop-AWS module `_ and `Spark integration with Cloud Infrastructures `_. - .. dropdown:: Version compatibility - - * Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) - * Scala versions: 2.12 - 2.13 - * Java versions: 8 - 20 - - .. warning:: - - See :ref:`spark-s3-troubleshooting` guide. - .. warning:: - To use SparkS3 connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`spark-s3-prerequisites` ..
note:: From 22818a55b7dfc04ed8aae08e5559d04f51908147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 22 May 2024 15:10:56 +0000 Subject: [PATCH 59/71] [DOP-14042] Improve Hive documentation --- .../next_release/278.improvement.rst | 4 + .../db_connection/clickhouse/execute.rst | 2 +- .../db_connection/greenplum/execute.rst | 2 +- .../db_connection/greenplum/read.rst | 3 +- .../connection/db_connection/hive/execute.rst | 47 ++++- docs/connection/db_connection/hive/index.rst | 2 + .../db_connection/hive/prerequisites.rst | 130 ++++++++++++ docs/connection/db_connection/hive/read.rst | 101 ++++++++- docs/connection/db_connection/hive/sql.rst | 81 ++++++++ docs/connection/db_connection/hive/write.rst | 192 +++++++++++++++++- .../db_connection/mysql/execute.rst | 2 +- .../db_connection/postgres/execute.rst | 2 +- .../db_connection/teradata/execute.rst | 2 +- .../db_connection/hive/connection.py | 103 +--------- .../connection/db_connection/hive/options.py | 16 -- 15 files changed, 562 insertions(+), 127 deletions(-) create mode 100644 docs/changelog/next_release/278.improvement.rst create mode 100644 docs/connection/db_connection/hive/prerequisites.rst create mode 100644 docs/connection/db_connection/hive/sql.rst diff --git a/docs/changelog/next_release/278.improvement.rst b/docs/changelog/next_release/278.improvement.rst new file mode 100644 index 000000000..bbb362142 --- /dev/null +++ b/docs/changelog/next_release/278.improvement.rst @@ -0,0 +1,4 @@ +Improve Hive documentation: + * Add "Prerequisites" page describing different aspects of connecting to Hive + * Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations. + * Improve "Executing statements in Hive" page of Hive documentation. diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index 2a8abdb70..f33369c57 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by Clickhouse, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index c0470a396..d19f8ab9d 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by Greenplum, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... 
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 1c76c9c72..98674d55a 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -34,7 +34,8 @@ Supported DBReader features In case of Greenplum connector, ``DBReader`` does not generate raw ``SELECT`` query. Instead it relies on Spark SQL syntax which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL. - So ``columns``, ``where`` and ``hwm.expression`` should be specified in Spark SQL syntax, not Greenplum SQL. + So ``columns``, ``where`` and ``hwm.expression`` should be specified in `Spark SQL `_ syntax, + not Greenplum SQL. This is OK: diff --git a/docs/connection/db_connection/hive/execute.rst b/docs/connection/db_connection/hive/execute.rst index ae32e61d2..dffadbf57 100644 --- a/docs/connection/db_connection/hive/execute.rst +++ b/docs/connection/db_connection/hive/execute.rst @@ -1,7 +1,52 @@ .. _hive-execute: Executing statements in Hive -============================ +============================= + +Use ``Hive.execute(...)`` to execute DDL and DML operations. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Hive, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``LOAD DATA ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``MSCK REPAIR TABLE ...``, and so on +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +.. warning:: + + Note that the query should be written using `SparkSQL `_ syntax, not HiveQL. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Hive + + hive = Hive(...) + + hive.execute("DROP TABLE schema.table") + hive.execute( + """ + CREATE TABLE schema.table ( + id BIGINT, + key STRING, + value DOUBLE + ) + PARTITIONED BY (business_date DATE) + STORED AS orc + """ + ) + +Details +------- .. currentmodule:: onetl.connection.db_connection.hive.connection diff --git a/docs/connection/db_connection/hive/index.rst b/docs/connection/db_connection/hive/index.rst index 9dd900b07..6d42666cc 100644 --- a/docs/connection/db_connection/hive/index.rst +++ b/docs/connection/db_connection/hive/index.rst @@ -7,6 +7,7 @@ Hive :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,6 +15,7 @@ Hive :caption: Operations read + sql write execute diff --git a/docs/connection/db_connection/hive/prerequisites.rst b/docs/connection/db_connection/hive/prerequisites.rst new file mode 100644 index 000000000..d690f918f --- /dev/null +++ b/docs/connection/db_connection/hive/prerequisites.rst @@ -0,0 +1,130 @@ +.. _hive-prerequisites: + +Prerequisites +============= + +.. note:: + + onETL's Hive connection is actually a SparkSession with access to `Hive Thrift Metastore `_ + and HDFS/S3. + All data movement is performed by Spark. Hive Metastore is used only to store table and partition metadata. + + This connector does **NOT** require a Hive server. It also does **NOT** use the Hive JDBC connector.
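In practice this means the connection simply wraps an existing Hive-enabled Spark session (a minimal sketch; the cluster name and app name are placeholders):

.. code:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Hive

    # Spark session with Hive Metastore support; all reads and writes are performed by Spark itself
    spark = SparkSession.builder.appName("spark-app-name").enableHiveSupport().getOrCreate()

    hive = Hive(cluster="rnd-dwh", spark=spark).check()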
+ +Version Compatibility +--------------------- + +* Hive Metastore version: 0.12 - 3.1.3 (may require adding a proper .jar file explicitly) +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + + +Installing PySpark +------------------ + +To use Hive connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Hive Metastore +---------------------------- + +.. note:: + + If you're using a managed Hadoop cluster, skip this step, as all Spark configs should already be present on the host. + +Create ``$SPARK_CONF_DIR/hive-site.xml`` with the Hive Metastore URL: + +.. code:: xml + + <configuration> + <property> + <name>hive.metastore.uris</name> + <value>thrift://metastore.host.name:9083</value> + </property> + </configuration> + +Create ``$SPARK_CONF_DIR/core-site.xml`` with the warehouse location, e.g. HDFS IPC port of Hadoop namenode, or S3 bucket address & credentials: + +.. tabs:: + + .. code-tab:: xml HDFS + + <configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://myhadoopcluster:9820</value> + </property> + </configuration> + + .. code-tab:: xml S3 + + <configuration> + <!-- See https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html#General_S3A_Client_configuration --> + <property> + <name>fs.defaultFS</name> + <value>s3a://mybucket/</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.endpoint</name> + <value>http://s3.domain.com</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.connection.ssl.enabled</name> + <value>false</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.path.style.access</name> + <value>true</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.aws.credentials.provider</name> + <value>org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.access.key</name> + <value>some-user</value> + </property> + <property> + <name>fs.s3a.bucket.mybucket.secret.key</name> + <value>mysecrettoken</value> + </property> + </configuration> + +Using Kerberos +-------------- + +Some Hadoop managed clusters use Kerberos authentication. In this case, you should call the `kinit `_ command +**BEFORE** starting the Spark session to generate a Kerberos ticket. See :ref:`install-kerberos`. + +Sometimes it is also required to pass a keytab file to the Spark config, allowing Spark executors to generate their own Kerberos tickets: + +.. tabs:: + + .. code-tab:: python Spark 3 + + SparkSession.builder + .config("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.kerberos.principal", "user") + .config("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + .. code-tab:: python Spark 2 + + SparkSession.builder + .config("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.yarn.principal", "user") + .config("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() + +See `Spark security documentation `_ +for more details. diff --git a/docs/connection/db_connection/hive/read.rst b/docs/connection/db_connection/hive/read.rst index a9961b4ab..fb8091055 100644 --- a/docs/connection/db_connection/hive/read.rst +++ b/docs/connection/db_connection/hive/read.rst @@ -1,13 +1,100 @@ .. _hive-read: -Reading from Hive -================= +Reading from Hive using ``DBReader`` +==================================== -There are 2 ways of distributed data reading from Hive: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Hive.sql ` +Supported DBReader features +--------------------------- -..
currentmodule:: onetl.connection.db_connection.hive.connection +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Hive) +* ❌ ``df_schema`` +* ❌ ``options`` (only Spark config params are used) -.. automethod:: Hive.sql +.. warning:: + + Actually, ``columns``, ``where`` and ``hwm.expression`` should be written using `SparkSQL `_ syntax, + not HiveQL. + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBReader + + hive = Hive(...) + + reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + hive = Hive(...) + + reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="hive_hwm", expression="updated_dt"), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For colum-based write formats, each file contains separated sections there column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +Use partition columns in ``where`` clause +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Queries should include ``WHERE`` clause with filters on Hive partitioning columns. +This allows Spark to read only small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance. + +Supported operators are: ``=``, ``>``, ``<`` and ``BETWEEN``, and only against some **static** value. diff --git a/docs/connection/db_connection/hive/sql.rst b/docs/connection/db_connection/hive/sql.rst new file mode 100644 index 000000000..7b02ec2b7 --- /dev/null +++ b/docs/connection/db_connection/hive/sql.rst @@ -0,0 +1,81 @@ +.. _hive-sql: + +Reading from Hive using ``Hive.sql`` +==================================== + +``Hive.sql`` allows passing custom SQL query, but does not support incremental strategies. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +.. warning:: + + Actually, query should be written using `SparkSQL `_ syntax, not HiveQL. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Hive + + hive = Hive(...) 
+ df = hive.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """ + ) + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For column-based write formats, each file contains separate sections where column data is stored. The file footer contains +the location of each column section/group. Spark can use this information to load only the sections required by a specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...``, prefer passing exact column names: ``SELECT col1, col2, ...``. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +Use partition columns in ``where`` clause +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Queries should include a ``WHERE`` clause with filters on Hive partitioning columns. +This allows Spark to read only a small set of files (*partition pruning*) instead of scanning the entire table, which drastically increases performance. + +Supported operators are: ``=``, ``>``, ``<`` and ``BETWEEN``, and only against some **static** value. + +Details +------- + +.. currentmodule:: onetl.connection.db_connection.hive.connection + +.. automethod:: Hive.sql diff --git a/docs/connection/db_connection/hive/write.rst b/docs/connection/db_connection/hive/write.rst index 70c9f3099..0b286a83d 100644 --- a/docs/connection/db_connection/hive/write.rst +++ b/docs/connection/db_connection/hive/write.rst @@ -1,9 +1,195 @@ .. _hive-write: -Writing to Hive -=============== +Writing to Hive using ``DBWriter`` +=================================== -For writing data to Hive, use :obj:`DBWriter ` with options below. +For writing data to Hive, use :obj:`DBWriter `. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBWriter + + hive = Hive(...) + + df = ... # data is here + + # Create a dataframe with a specific number of Spark partitions. + # Use the Hive partitioning columns to group the data. Create only 20 files per Hive partition. + # Also sort the data by the columns the data is most correlated with, reducing file size. + write_df = df.repartition( + 20, "country", "business_date", "user_id" + ).sortWithinPartitions("country", "business_date", "user_id") + + writer = DBWriter( + connection=hive, + target="schema.table", + options=Hive.WriteOptions( + if_exists="append", + # Hive partitioning columns. + # ``user_id`` column is not included, as it has a lot of distinct values. + partitionBy=["country", "business_date"], + ), + ) + + writer.run(write_df) + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For column-based write formats, each file contains separate sections where column data is stored. The file footer contains +the location of each column section/group. Spark can use this information to load only the sections required by a specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV.
+ +Use partitioning +~~~~~~~~~~~~~~~~ + +How does it work +^^^^^^^^^^^^^^^^ + +Hive support splitting data to partitions, which are different directories in filesystem with names like ``some_col=value1/another_col=value2``. + +For example, dataframe with content like this: + ++-----------------+---------------------+--------------+-------------+ +| country: string | business_date: date | user_id: int | bytes: long | ++=================+=====================+==============+=============+ +| RU | 2024-01-01 | 1234 | 25325253525 | ++-----------------+---------------------+--------------+-------------+ +| RU | 2024-01-01 | 2345 | 23234535243 | ++-----------------+---------------------+--------------+-------------+ +| RU | 2024-01-02 | 1234 | 62346634564 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-01 | 5678 | 4252345354 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-02 | 5678 | 5474575745 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-03 | 5678 | 3464574567 | ++-----------------+---------------------+--------------+-------------+ + +With ``partition_by=["country", "business_dt"]`` data will be stored as files in the following subfolders: + * ``/country=RU/business_date=2024-01-01/`` + * ``/country=RU/business_date=2024-01-02/`` + * ``/country=US/business_date=2024-01-01/`` + * ``/country=US/business_date=2024-01-02/`` + * ``/country=US/business_date=2024-01-03/`` + +A separated subdirectory is created for each distinct combination of column values in the dataframe. + +Please do not confuse Spark dataframe partitions (a.k.a batches of data handled by Spark executors, usually in parallel) +and Hive partitioning (store data in different subdirectories). +Number of Spark dataframe partitions is correlated the number of files created in **each** Hive partition. +For example, Spark dataframe with 10 partitions and 5 distinct values of Hive partition columns will be saved as 5 subfolders with 10 files each = 50 files in total. +Without Hive partitioning, all the files are placed into one flat directory. + +But why? +^^^^^^^^ + +Queries which has ``WHERE`` clause with filters on Hive partitioning columns, like ``WHERE country = 'RU' AND business_date='2024-01-01'``, will +read only files from this exact partitions, like ``/country=RU/business_date=2024-01-01/``, and skip files from other partitions. + +This drastically increases performance and reduces the amount of memory used by Spark. +Consider using Hive partitioning in all tables. + +Which columns should I use? +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Usually Hive partitioning columns are based on event date or location, like ``country: string``, ``business_date: date``, ``run_date: date`` and so on. + +**Partition columns should contain data with low cardinality.** +Dates, small integers, strings with low number of possible values are OK. +But timestamp, float, decimals, longs (like user id), strings with lots oj unique values (like user name or email) should **NOT** be used as Hive partitioning columns. +Unlike some other databases, range and hash-based partitions are not supported. + +Partition column should be a part of a dataframe. If you want to partition values by date component of ``business_dt: timestamp`` column, +add a new column to dataframe like this: ``df.withColumn("business_date", date(df.business_dt))``. 
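For example, a minimal sketch of deriving such a column with PySpark's built-in ``to_date`` function (column names are assumptions):

.. code:: python

    from pyspark.sql.functions import to_date

    # derive a low-cardinality partition column from a timestamp column
    df = df.withColumn("business_date", to_date(df.business_dt))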
+ +Use compression +~~~~~~~~~~~~~~~ + +Using compression algorithms like ``snappy``, ``lz4`` or ``zstd`` can reduce the size of files (up to 10x). + +Prefer creating large files +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing millions of small files is not that HDFS and S3 are designed for. Minimal file size should be at least 10Mb, but usually it is like 128Mb+ or 256Mb+ (HDFS block size). +**NEVER** create files with few Kbytes in size. + +Number of files can be different in different cases. +On one hand, Spark Adaptive Query Execution (AQE) can merge small Spark dataframe partitions into one larger. +On the other hand, dataframes with skewed data can produce a larger number of files than expected. + +To create small amount of large files, you can reduce number of Spark dataframe partitions. +Use `df.repartition(N, columns...) `_ function, +like this: ``df.repartition(20, "col1", "col2")``. +This creates new Spark dataframe with partitions using ``hash(df.col1 + df.col2) mod 20`` expression, avoiding data skew. + +Note: larger dataframe partitions requires more resources (CPU, RAM) on Spark executor. The exact number of partitions +should be determined empirically, as it depends on the amount of data and available resources. + +Sort data before writing +~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataframe with sorted content: + ++-----------------+---------------------+--------------+-------------------------+-------------+ +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | ++=================+=====================+==============+=========================+=============+ +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-01 | 1234 | 2024-01-01T12:23:44.567 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-02 | 1234 | 2024-01-01T13:25:56.789 | 34335645635 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-01 | 2345 | 2024-01-01T10:00:00.000 | 12341 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-02 | 2345 | 2024-01-01T15:11:22.345 | 13435 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-03 | 2345 | 2024-01-01T20:22:33.567 | 14564 | ++-----------------+---------------------+--------------+-------------------------+-------------+ + +Has a much better compression rate than unsorted one, e.g. 
2x or even higher: + ++-----------------+---------------------+--------------+-------------------------+-------------+ +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | ++=================+=====================+==============+=========================+=============+ +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-01 | 6345 | 2024-12-01T23:03:44.567 | 25365 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-02 | 5234 | 2024-07-01T06:10:56.789 | 45643456747 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-01 | 4582 | 2024-04-01T17:59:00.000 | 362546475 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-02 | 2345 | 2024-09-01T04:24:22.345 | 3235 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-03 | 3575 | 2024-03-01T21:37:33.567 | 346345764 | ++-----------------+---------------------+--------------+-------------------------+-------------+ + +Choosing columns to sort data by is really depends on the data. If data is correlated with some specific +column, like in example above the amount of traffic is correlated with both ``user_id`` and ``timestamp``, +use ``df.sortWithinPartitions("user_id", "timestamp")`` before writing the data. + +If ``df.repartition(N, repartition_columns...)`` is used in combination with ``df.sortWithinPartitions(sort_columns...)``, +then ``sort_columns`` should start with ``repartition_columns`` or be equal to it. + +Options +------- .. currentmodule:: onetl.connection.db_connection.hive.options diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index 477ad6fa7..f9e95f5e6 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by MySQL, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 8c6fbc858..e8ef17b03 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -67,7 +67,7 @@ Syntax support This method supports **any** query syntax supported by Postgres, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... 
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 28be9d35b..300a45c30 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -64,7 +64,7 @@ Syntax support This method supports **any** query syntax supported by Teradata, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index f465fd73a..42f6f2a92 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -39,47 +39,9 @@ class Hive(DBConnection): """Spark connection with Hive MetaStore support. |support_hooks| - You don't need a Hive server to use this connector. - - .. dropdown:: Version compatibility - - * Hive metastore version: 0.12 - 3.1.2 (may require to add proper .jar file explicitly) - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - .. warning:: - - To use Hive connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. - .. warning:: - This connector requires some additional configuration files to be present (``hive-site.xml`` and so on), - as well as .jar files with Hive MetaStore client. - - See `Spark Hive Tables documentation `_ - and `this guide `_ for more details. - - .. note:: - - Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. - - In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options - to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. - See `Spark security documentation `_ - for more details. + Before using this connector please take into account :ref:`hive-prerequisites` Parameters ---------- @@ -92,7 +54,7 @@ class Hive(DBConnection): Examples -------- - Hive connection initialization + Hive connection initialization: .. code:: python @@ -105,7 +67,11 @@ class Hive(DBConnection): # Create connection hive = Hive(cluster="rnd-dwh", spark=spark).check() - Hive connection initialization with Kerberos support + Hive connection initialization with Kerberos support: + + .. code:: bash + + $ kinit -kt /path/to/keytab user .. code:: python @@ -221,28 +187,13 @@ def sql( ---------- query : str - SQL query to be executed, like: - - * ``SELECT ... FROM ...`` - * ``WITH ... AS (...) SELECT ... FROM ...`` - * ``SHOW ...`` queries are also supported, like ``SHOW TABLES`` + SQL query to be executed. Returns ------- df : pyspark.sql.dataframe.DataFrame Spark dataframe - - Examples - -------- - - Read data from Hive table: - - .. 
code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - df = connection.sql("SELECT * FROM mytable") """ query = clear_statement(query) @@ -266,43 +217,7 @@ def execute( ---------- statement : str - Statement to be executed, like: - - DML statements: - - * ``INSERT INTO target_table SELECT * FROM source_table`` - * ``TRUNCATE TABLE mytable`` - - DDL statements: - - * ``CREATE TABLE mytable (...)`` - * ``ALTER TABLE mytable ...`` - * ``DROP TABLE mytable`` - * ``MSCK REPAIR TABLE mytable`` - - The exact list of supported statements depends on Hive version, - for example some new versions support ``CREATE FUNCTION`` syntax. - - Examples - -------- - - Create table: - - .. code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - connection.execute( - "CREATE TABLE mytable (id NUMBER, data VARCHAR) PARTITIONED BY (date DATE)" - ) - - Drop table partition: - - .. code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - connection.execute("ALTER TABLE mytable DROP PARTITION(date='2023-02-01')") + Statement to be executed. """ statement = clear_statement(statement) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 5f687112f..a196487a1 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -217,24 +217,8 @@ class Config: """ List of columns should be used for data partitioning. ``None`` means partitioning is disabled. - Each partition is a folder which contains only files with the specific column value, - like ``myschema.db/mytable/col1=value1``, ``myschema.db/mytable/col1=value2``, and so on. - - Multiple partitions columns means nested folder structure, like ``myschema.db/mytable/col1=val1/col2=val2``. - - If ``WHERE`` clause in the query contains expression like ``partition = value``, - Spark will scan only files in a specific partition. - Examples: ``reg_id`` or ``["reg_id", "business_dt"]`` - .. note:: - - Values should be scalars (integers, strings), - and either static (``countryId``) or incrementing (dates, years), with low - number of distinct values. - - Columns like ``userId`` or ``datetime``/``timestamp`` should **NOT** be used for partitioning. - .. 
warning:: Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` From d08ac20c1f14beb8a5b54dd9cb0618121052e7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 15:21:51 +0000 Subject: [PATCH 60/71] [DOP-13851] Update MySQL package to 8.4.0 --- .github/workflows/data/mysql/matrix.yml | 4 ++-- docs/changelog/next_release/253.feature.rst | 2 +- docs/connection/db_connection/mysql/prerequisites.rst | 2 +- onetl/connection/db_connection/mysql/connection.py | 10 +++++----- .../tests_db_connection_unit/test_mysql_unit.py | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index cd96a63b9..8e46b42e8 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -21,14 +21,14 @@ latest: &latest matrix: small: - - mysql-version: 8.3.0 + - mysql-version: 8.4.0 <<: *max full: # Min supported version by JDBC driver is 5.7 - mysql-version: 5.7.6 <<: *min # Max supported version by JDBC driver is 8.3 - - mysql-version: 8.3.0 + - mysql-version: 8.4.0 <<: *max nightly: - mysql-version: 5.7.6 diff --git a/docs/changelog/next_release/253.feature.rst b/docs/changelog/next_release/253.feature.rst index 46b364d95..92994bdbb 100644 --- a/docs/changelog/next_release/253.feature.rst +++ b/docs/changelog/next_release/253.feature.rst @@ -1 +1 @@ -:class:`MySQL` connection now uses MySQL JDBC driver ``8.3.0``, upgraded from ``8.0.33``, and supports passing custom versions: ``MySQL.get_packages(package_version=...)``. +:class:`MySQL` connection now uses MySQL JDBC driver ``8.4.0``, upgraded from ``8.0.33``, and supports passing custom versions: ``MySQL.get_packages(package_version=...)``. diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index 99b7820cb..225e630b2 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* MySQL server versions: 5.7, 8.0 +* MySQL server versions: 5.7 - 8.4 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 6774870b9..a80bd7ac9 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -34,8 +34,8 @@ class Config: class MySQL(JDBCConnection): """MySQL JDBC connection. |support_hooks| - Based on Maven package ``com.mysql:mysql-connector-j:8.0.33`` - (`official MySQL JDBC driver `_). + Based on Maven package `com.mysql:mysql-connector-j:8.4.0 `_ + (`official MySQL JDBC driver `_). .. warning:: @@ -125,7 +125,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: Parameters ---------- package_version : str, optional - Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.3.0``. + Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.4.0``. 
Examples -------- @@ -138,7 +138,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: # specify a custom package version MySQL.get_packages(package_version="8.2.0") """ - default_version = "8.3.0" + default_version = "8.4.0" version = Version(package_version or default_version).min_digits(3) return [f"com.mysql:mysql-connector-j:{version}"] @@ -148,7 +148,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MySQL.package` will be removed in 1.0.0, use `MySQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.mysql:mysql-connector-j:8.3.0" + return "com.mysql:mysql-connector-j:8.4.0" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index da9267586..e95b34f1a 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -14,14 +14,14 @@ def test_mysql_class_attributes(): def test_mysql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MySQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MySQL.package == "com.mysql:mysql-connector-j:8.3.0" + assert MySQL.package == "com.mysql:mysql-connector-j:8.4.0" @pytest.mark.parametrize( "package_version, expected_packages", [ - (None, ["com.mysql:mysql-connector-j:8.3.0"]), - ("8.3.0", ["com.mysql:mysql-connector-j:8.3.0"]), + (None, ["com.mysql:mysql-connector-j:8.4.0"]), + ("8.4.0", ["com.mysql:mysql-connector-j:8.4.0"]), ("8.1.0", ["com.mysql:mysql-connector-j:8.1.0"]), ("8.0.33", ["com.mysql:mysql-connector-j:8.0.33"]), ], From 67abb39e1de3c3c18ea4190add7053d10b3ce65e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 15:19:10 +0000 Subject: [PATCH 61/71] [DOP-13850] Update Oracle package to 23.4.0.24.05 --- docs/changelog/next_release/252.feature.rst | 2 +- .../db_connection/oracle/connection.py | 10 +++++----- .../tests_db_connection_unit/test_oracle_unit.py | 16 ++++++++-------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/changelog/next_release/252.feature.rst b/docs/changelog/next_release/252.feature.rst index 497b3c4ae..db7c72960 100644 --- a/docs/changelog/next_release/252.feature.rst +++ b/docs/changelog/next_release/252.feature.rst @@ -1 +1 @@ -:class:`Oracle` connection now uses Oracle JDBC driver ``23.3.0.0.23.09``, upgraded from ``23.2.0.0``, and supports passing custom versions: ``Oracle.get_packages(java_version=..., package_version=...)``. +:class:`Oracle` connection now uses Oracle JDBC driver ``23.4.0.24.05``, upgraded from ``23.2.0.0``, and supports passing custom versions: ``Oracle.get_packages(java_version=..., package_version=...)``. diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 28494bdde..612cfd9e5 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -81,7 +81,7 @@ class Config: class Oracle(JDBCConnection): """Oracle JDBC connection. 
|support_hooks| - Based on Maven package ``com.oracle.database.jdbc:ojdbc8:23.2.0.0`` + Based on Maven package `com.oracle.database.jdbc:ojdbc8:23.4.0.24.05 `_ (`official Oracle JDBC driver `_). .. warning:: @@ -207,7 +207,7 @@ def get_packages( java_version : str, optional Java major version, defaults to "8". Must be "8" or "11". package_version : str, optional - Specifies the version of the Oracle JDBC driver to use. Defaults to "23.3.0.0.23.09". + Specifies the version of the Oracle JDBC driver to use. Defaults to "23.4.0.24.05". Examples -------- @@ -219,11 +219,11 @@ def get_packages( Oracle.get_packages() # specify Java and package versions - Oracle.get_packages(java_version="8", package_version="23.2.0.0") + Oracle.get_packages(java_version="8", package_version="23.4.0.24.05") """ default_java_version = "8" - default_package_version = "23.3.0.23.09" + default_package_version = "23.4.0.24.05" java_ver = Version(java_version or default_java_version) if java_ver.major < 8: @@ -239,7 +239,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Oracle.package` will be removed in 1.0.0, use `Oracle.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.oracle.database.jdbc:ojdbc8:23.3.0.23.09" + return "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index d4db6940e..892c86906 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -14,11 +14,11 @@ def test_oracle_class_attributes(): def test_oracle_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Oracle.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.3.0.23.09" + assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" def test_oracle_get_packages_no_input(): - assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"] + assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"] @pytest.mark.parametrize("java_version", ["7", "6"]) @@ -30,16 +30,16 @@ def test_oracle_get_packages_java_version_not_supported(java_version): @pytest.mark.parametrize( "java_version, package_version, expected_packages", [ - (None, None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), - ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), - ("8", "23.3.0.23.09", ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + (None, None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("8", "23.4.0.24.05", ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), ("8", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), - ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.3.0.23.09"]), + ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), ("9", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), - ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"]), + ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), ("11", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), ("17", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), - ("20", "23.3.0.23.09", ["com.oracle.database.jdbc:ojdbc11:23.3.0.23.09"]), + ("20", "23.4.0.24.05", 
["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), ], ) def test_oracle_get_packages(java_version, package_version, expected_packages): From 2c879fcea6ecc8f6690ce7efa824e47180c19291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 15:17:16 +0000 Subject: [PATCH 62/71] [DOP-13853] Update MongoDB package to 10.3.0 --- .github/workflows/data/mongodb/matrix.yml | 6 +++--- docs/changelog/next_release/255.feature.rst | 2 +- .../db_connection/mongodb/prerequisites.rst | 3 +-- .../connection/db_connection/mongodb/types.rst | 4 ++-- .../db_connection/mongodb/connection.py | 16 ++++++++-------- .../test_mongodb_unit.py | 18 +++++++++--------- 6 files changed, 24 insertions(+), 25 deletions(-) diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 68c19956d..a07bdd3b7 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -7,7 +7,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.4.3 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -22,12 +22,12 @@ latest: &latest matrix: small: - - mongodb-version: 7.0.8 + - mongodb-version: 7.0.9 <<: *max full: - mongodb-version: 4.0.0 <<: *min - - mongodb-version: 7.0.8 + - mongodb-version: 7.0.9 <<: *max nightly: - mongodb-version: 4.0.0 diff --git a/docs/changelog/next_release/255.feature.rst b/docs/changelog/next_release/255.feature.rst index aff64c57c..4eae087fa 100644 --- a/docs/changelog/next_release/255.feature.rst +++ b/docs/changelog/next_release/255.feature.rst @@ -1 +1 @@ -:class:`MongoDB` connection now uses MongoDB Spark connector ``10.2.3``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. +:class:`MongoDB` connection now uses MongoDB Spark connector ``10.3.0``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst index 6bc6e90e9..7df5f5022 100644 --- a/docs/connection/db_connection/mongodb/prerequisites.rst +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -7,8 +7,7 @@ Version Compatibility --------------------- * MongoDB server versions: 4.0 or higher -* Spark versions: 3.2.x - 3.4.x -* Scala versions: 2.12 - 2.13 +* Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 See `official documentation `_. diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index f9787ff2e..2a0231643 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -73,8 +73,8 @@ References Here you can find source code with type conversions: -* `MongoDB -> Spark `_ -* `Spark -> MongoDB `_ +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ Supported types --------------- diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index f2c959295..690bd6d4b 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -50,8 +50,8 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. 
|support_hooks| - Based on package `org.mongodb.spark:mongo-spark-connector:10.2.3 `_ - (`MongoDB connector for Spark `_) + Based on package `org.mongodb.spark:mongo-spark-connector:10.3.0 `_ + (`MongoDB connector for Spark `_) .. warning:: @@ -149,7 +149,7 @@ def get_packages( Spark version in format ``major.minor``. Used only if ``scala_version=None``. package_version : str, optional - Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.2.3``. + Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.3.0``. Examples -------- @@ -160,10 +160,10 @@ def get_packages( MongoDB.get_packages(scala_version="2.12") # specify custom connector version - MongoDB.get_packages(scala_version="2.12", package_version="10.2.3") + MongoDB.get_packages(scala_version="2.12", package_version="10.3.0") """ - default_package_version = "10.2.3" + default_package_version = "10.3.0" if scala_version: scala_ver = Version(scala_version).min_digits(2) @@ -190,7 +190,7 @@ def package_spark_3_2(cls) -> str: "use `MongoDB.get_packages(spark_version='3.2')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @classproperty def package_spark_3_3(cls) -> str: @@ -200,7 +200,7 @@ def package_spark_3_3(cls) -> str: "use `MongoDB.get_packages(spark_version='3.3')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @classproperty def package_spark_3_4(cls) -> str: @@ -210,7 +210,7 @@ def package_spark_3_4(cls) -> str: "use `MongoDB.get_packages(spark_version='3.4')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @slot def pipeline( diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 3e5f85215..f494e3deb 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -12,9 +12,9 @@ def test_mongodb_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MongoDB.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" - assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" - assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3" + assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" def test_mongodb_get_packages_no_input(): @@ -50,16 +50,16 @@ def test_mongodb_get_packages_scala_version_not_supported(scala_version): @pytest.mark.parametrize( "spark_version, scala_version, package_version, package", [ - (None, "2.12", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), - (None, "2.13", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.13:10.2.3"), - ("3.2", None, "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), - ("3.3", None, "10.2.3", 
"org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), - ("3.4", None, "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), + (None, "2.12", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + (None, "2.13", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.13:10.3.0"), + ("3.2", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.3", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.4", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), ("3.2", "2.12", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), ("3.4", "2.13", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), ("3.2", "2.12", "10.2.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"), ("3.2", "2.12", "10.2.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0"), - ("3.2.4", "2.12.1", "10.2.3", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.3"), + ("3.2.4", "2.12.1", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), ], ) def test_mongodb_get_packages(spark_version, scala_version, package_version, package): From 8a2cbd948f17534995c89eb93aa6a3f22e8427e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 14:51:06 +0000 Subject: [PATCH 63/71] [DOP-16174] Small documentation fixes --- docs/changelog/0.10.0.rst | 4 +- docs/changelog/0.8.0.rst | 6 --- docs/changelog/0.9.0.rst | 50 +++++++++---------- .../db_connection/clickhouse/types.rst | 10 ++-- .../db_connection/greenplum/types.rst | 6 +-- docs/connection/db_connection/mssql/types.rst | 6 +-- docs/connection/db_connection/mysql/types.rst | 6 +-- .../connection/db_connection/oracle/types.rst | 8 +-- .../db_connection/postgres/types.rst | 8 +-- .../db_connection/clickhouse/connection.py | 2 +- .../db_connection/clickhouse/options.py | 10 ++-- .../db_connection/greenplum/connection.py | 2 +- .../db_connection/greenplum/options.py | 6 +-- .../db_connection/hive/connection.py | 2 +- .../db_connection/kafka/connection.py | 2 +- .../db_connection/mongodb/connection.py | 2 +- .../db_connection/mssql/connection.py | 4 +- .../connection/db_connection/mssql/options.py | 10 ++-- .../db_connection/mysql/connection.py | 2 +- .../connection/db_connection/mysql/options.py | 10 ++-- .../db_connection/oracle/connection.py | 2 +- .../db_connection/oracle/options.py | 10 ++-- .../db_connection/postgres/connection.py | 4 +- .../db_connection/postgres/options.py | 10 ++-- .../db_connection/teradata/connection.py | 4 +- .../db_connection/teradata/options.py | 10 ++-- .../spark_hdfs/connection.py | 2 +- .../file_df_connection/spark_s3/connection.py | 2 +- onetl/db/db_reader/db_reader.py | 22 +------- onetl/db/db_writer/db_writer.py | 6 +-- onetl/file/format/avro.py | 1 - onetl/file/format/excel.py | 3 +- onetl/file/format/xml.py | 3 +- 33 files changed, 103 insertions(+), 132 deletions(-) diff --git a/docs/changelog/0.10.0.rst b/docs/changelog/0.10.0.rst index e546d150c..189a2457f 100644 --- a/docs/changelog/0.10.0.rst +++ b/docs/changelog/0.10.0.rst @@ -44,7 +44,7 @@ Breaking Changes - ``HWM`` classes used by previous onETL versions were moved from ``etl_entities`` to ``etl_entities.old_hwm`` submodule. They are here for compatibility reasons, but are planned to be removed in ``etl-entities`` v3 release. 
- New ``HWM`` classes have flat structure instead of nested. - New ``HWM`` classes have mandatory ``name`` attribute (it was known as ``qualified_name`` before). - - Type aliases used while serializing and deserializing ``HWM`` objects to ``dict`` representation were changed too: ``int`` -> ``column_int``. + - Type aliases used while serializing and deserializing ``HWM`` objects to ``dict`` representation were changed too: ``int`` → ``column_int``. To make migration simpler, you can use new method: @@ -53,7 +53,7 @@ Breaking Changes old_hwm = OldIntHWM(...) new_hwm = old_hwm.as_new_hwm() - Which automatically converts all fields from old structure to new one, including ``qualified_name`` -> ``name``. + Which automatically converts all fields from old structure to new one, including ``qualified_name`` → ``name``. - **Breaking changes:** diff --git a/docs/changelog/0.8.0.rst b/docs/changelog/0.8.0.rst index 52295a4d2..0e82a1cb5 100644 --- a/docs/changelog/0.8.0.rst +++ b/docs/changelog/0.8.0.rst @@ -62,8 +62,6 @@ Breaking Changes from onetl.core import DBWriter from onetl.core import FileDownloader from onetl.core import FileUploader - from onetl.core import FileResult - from onetl.core import FileSet with new modules ``onetl.db`` and ``onetl.file``: @@ -76,10 +74,6 @@ Breaking Changes from onetl.file import FileDownloader from onetl.file import FileUploader - # not a public interface - from onetl.file.file_result import FileResult - from onetl.file.file_set import FileSet - Imports from old module ``onetl.core`` still can be used, but marked as deprecated. Module will be removed in v1.0.0. (:github:pull:`46`) diff --git a/docs/changelog/0.9.0.rst b/docs/changelog/0.9.0.rst index 24927f945..ddd561e5f 100644 --- a/docs/changelog/0.9.0.rst +++ b/docs/changelog/0.9.0.rst @@ -6,43 +6,43 @@ Breaking Changes - Rename methods: - * ``DBConnection.read_df`` -> ``DBConnection.read_source_as_df`` - * ``DBConnection.write_df`` -> ``DBConnection.write_df_to_target`` (:github:pull:`66`) + * ``DBConnection.read_df`` → ``DBConnection.read_source_as_df`` + * ``DBConnection.write_df`` → ``DBConnection.write_df_to_target`` (:github:pull:`66`) - Rename classes: - * ``HDFS.slots`` -> ``HDFS.Slots`` - * ``Hive.slots`` -> ``Hive.Slots`` + * ``HDFS.slots`` → ``HDFS.Slots`` + * ``Hive.slots`` → ``Hive.Slots`` Old names are left intact, but will be removed in v1.0.0 (:github:pull:`103`) - Rename options to make them self-explanatory: - * ``Hive.WriteOptions(mode="append")`` -> ``Hive.WriteOptions(if_exists="append")`` - * ``Hive.WriteOptions(mode="overwrite_table")`` -> ``Hive.WriteOptions(if_exists="replace_entire_table")`` - * ``Hive.WriteOptions(mode="overwrite_partitions")`` -> ``Hive.WriteOptions(if_exists="replace_overlapping_partitions")`` + * ``Hive.WriteOptions(mode="append")`` → ``Hive.WriteOptions(if_exists="append")`` + * ``Hive.WriteOptions(mode="overwrite_table")`` → ``Hive.WriteOptions(if_exists="replace_entire_table")`` + * ``Hive.WriteOptions(mode="overwrite_partitions")`` → ``Hive.WriteOptions(if_exists="replace_overlapping_partitions")`` - * ``JDBC.WriteOptions(mode="append")`` -> ``JDBC.WriteOptions(if_exists="append")`` - * ``JDBC.WriteOptions(mode="overwrite")`` -> ``JDBC.WriteOptions(if_exists="replace_entire_table")`` + * ``JDBC.WriteOptions(mode="append")`` → ``JDBC.WriteOptions(if_exists="append")`` + * ``JDBC.WriteOptions(mode="overwrite")`` → ``JDBC.WriteOptions(if_exists="replace_entire_table")`` - * ``Greenplum.WriteOptions(mode="append")`` -> 
``Greenplum.WriteOptions(if_exists="append")`` - * ``Greenplum.WriteOptions(mode="overwrite")`` -> ``Greenplum.WriteOptions(if_exists="replace_entire_table")`` + * ``Greenplum.WriteOptions(mode="append")`` → ``Greenplum.WriteOptions(if_exists="append")`` + * ``Greenplum.WriteOptions(mode="overwrite")`` → ``Greenplum.WriteOptions(if_exists="replace_entire_table")`` - * ``MongoDB.WriteOptions(mode="append")`` -> ``Greenplum.WriteOptions(if_exists="append")`` - * ``MongoDB.WriteOptions(mode="overwrite")`` -> ``Greenplum.WriteOptions(if_exists="replace_entire_collection")`` + * ``MongoDB.WriteOptions(mode="append")`` → ``Greenplum.WriteOptions(if_exists="append")`` + * ``MongoDB.WriteOptions(mode="overwrite")`` → ``Greenplum.WriteOptions(if_exists="replace_entire_collection")`` - * ``FileDownloader.Options(mode="error")`` -> ``FileDownloader.Options(if_exists="error")`` - * ``FileDownloader.Options(mode="ignore")`` -> ``FileDownloader.Options(if_exists="ignore")`` - * ``FileDownloader.Options(mode="overwrite")`` -> ``FileDownloader.Options(if_exists="replace_file")`` - * ``FileDownloader.Options(mode="delete_all")`` -> ``FileDownloader.Options(if_exists="replace_entire_directory")`` + * ``FileDownloader.Options(mode="error")`` → ``FileDownloader.Options(if_exists="error")`` + * ``FileDownloader.Options(mode="ignore")`` → ``FileDownloader.Options(if_exists="ignore")`` + * ``FileDownloader.Options(mode="overwrite")`` → ``FileDownloader.Options(if_exists="replace_file")`` + * ``FileDownloader.Options(mode="delete_all")`` → ``FileDownloader.Options(if_exists="replace_entire_directory")`` - * ``FileUploader.Options(mode="error")`` -> ``FileUploader.Options(if_exists="error")`` - * ``FileUploader.Options(mode="ignore")`` -> ``FileUploader.Options(if_exists="ignore")`` - * ``FileUploader.Options(mode="overwrite")`` -> ``FileUploader.Options(if_exists="replace_file")`` - * ``FileUploader.Options(mode="delete_all")`` -> ``FileUploader.Options(if_exists="replace_entire_directory")`` + * ``FileUploader.Options(mode="error")`` → ``FileUploader.Options(if_exists="error")`` + * ``FileUploader.Options(mode="ignore")`` → ``FileUploader.Options(if_exists="ignore")`` + * ``FileUploader.Options(mode="overwrite")`` → ``FileUploader.Options(if_exists="replace_file")`` + * ``FileUploader.Options(mode="delete_all")`` → ``FileUploader.Options(if_exists="replace_entire_directory")`` - * ``FileMover.Options(mode="error")`` -> ``FileMover.Options(if_exists="error")`` - * ``FileMover.Options(mode="ignore")`` -> ``FileMover.Options(if_exists="ignore")`` - * ``FileMover.Options(mode="overwrite")`` -> ``FileMover.Options(if_exists="replace_file")`` - * ``FileMover.Options(mode="delete_all")`` -> ``FileMover.Options(if_exists="replace_entire_directory")`` + * ``FileMover.Options(mode="error")`` → ``FileMover.Options(if_exists="error")`` + * ``FileMover.Options(mode="ignore")`` → ``FileMover.Options(if_exists="ignore")`` + * ``FileMover.Options(mode="overwrite")`` → ``FileMover.Options(if_exists="replace_file")`` + * ``FileMover.Options(mode="delete_all")`` → ``FileMover.Options(if_exists="replace_entire_directory")`` Old names are left intact, but will be removed in v1.0.0 (:github:pull:`108`) - Rename ``onetl.log.disable_clients_logging()`` to ``onetl.log.setup_clients_logging()``. 
(:github:pull:`120`) diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index ca024d36a..19f052d53 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -14,7 +14,7 @@ Reading from Clickhouse This is how Clickhouse connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Clickhouse type. -* Find corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Clickhouse type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing Clickhouse table @@ -25,8 +25,8 @@ This is how Clickhouse connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. -* **Find corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ -* Find corresponding ``Spark type`` -> ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* **Find corresponding** ``Clickhouse type (read)`` → ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` → ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Clickhousetype (write)`` match ``Clickhouse type (read)``, no additional casts will be performed, DataFrame column will be written to Clickhouse as is. * If ``Clickhousetype (write)`` does not match ``Clickhouse type (read)``, DataFrame column will be casted to target column type **on Clickhouse side**. For example, you can write column with text data to ``Int32`` column, if column contains valid integer values within supported value range and precision. @@ -47,7 +47,7 @@ Create new table using Spark This is how Clickhouse connector performs this: -* Find corresponding ``Spark type`` -> ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Clickhouse, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. 
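A minimal sketch of the table-creation flow described above (connection parameters, table name and the ``createTableOptions`` value are placeholders; ``createTableOptions`` is assumed to be passed through to Spark's JDBC writer as-is, so treat this as an illustration rather than the exact recommended setup):

.. code-block:: python

    from onetl.connection import Clickhouse
    from onetl.db import DBWriter

    # assuming an existing SparkSession "spark" and a DataFrame "df"
    clickhouse = Clickhouse(
        host="clickhouse.domain.com",
        user="someuser",
        password="***",
        database="default",
        spark=spark,
    )

    writer = DBWriter(
        connection=clickhouse,
        target="default.new_table",  # table does not exist yet
        options=Clickhouse.WriteOptions(
            if_exists="append",
            # appended to the generated CREATE TABLE statement,
            # e.g. to choose table engine and sorting key explicitly
            createTableOptions="ENGINE = MergeTree() ORDER BY (id)",
        ),
    )

    # DataFrame column types are mapped to Clickhouse types using the tables below,
    # then the table is created and rows are written as-is
    writer.run(df)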
@@ -218,7 +218,7 @@ Notes: +-----------------------------------+ | | | | ``DateTime64(P, TZ)`` | | | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalNanosecond`` | ``LongType()`` | ``Int64`` | ``Int64`` | +| ``IntervalNanosecond`` | ``LongType()`` | ``Int64`` | ``Int64`` | +-----------------------------------+ | | | | ``IntervalMicrosecond`` | | | | +-----------------------------------+ | | | diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst index 63e7b9a2b..9199467e2 100644 --- a/docs/connection/db_connection/greenplum/types.rst +++ b/docs/connection/db_connection/greenplum/types.rst @@ -15,7 +15,7 @@ This is how Greenplum connector performs this: * Execute query ``SELECT * FROM table LIMIT 0`` [1]_. * For each column in query result get column name and Greenplum type. -* Find corresponding ``Greenplum type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Greenplum type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Use Spark column projection and predicate pushdown features to build a final query. * Create DataFrame from generated query with inferred schema. @@ -34,7 +34,7 @@ This is how Greenplum connector performs this: * Match table columns with DataFrame columns (by name, case insensitive). If some column is present only in target table, but not in DataFrame (like ``DEFAULT`` or ``SERIAL`` column), and vice versa, raise an exception. See `Explicit type cast`_. -* Find corresponding ``Spark type`` -> ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Greenplumtype (write)`` match ``Greenplum type (read)``, no additional casts will be performed, DataFrame column will be written to Greenplum as is. * If ``Greenplumtype (write)`` does not match ``Greenplum type (read)``, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to ``json`` column which Greenplum connector currently does not support. @@ -47,7 +47,7 @@ Create new table using Spark This is how Greenplum connector performs this: -* Find corresponding ``Spark type`` -> ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Greenplum, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst index ea7fd7102..1052143ed 100644 --- a/docs/connection/db_connection/mssql/types.rst +++ b/docs/connection/db_connection/mssql/types.rst @@ -14,7 +14,7 @@ Reading from MSSQL This is how MSSQL connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MSSQL type. 
-* Find corresponding ``MSSQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``MSSQL type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing MSSQL table @@ -25,7 +25,7 @@ This is how MSSQL connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. -* Find corresponding ``Spark type`` -> ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``MSSQL type (write)`` match ``MSSQL type (read)``, no additional casts will be performed, DataFrame column will be written to MSSQL as is. * If ``MSSQL type (write)`` does not match ``MSSQL type (read)``, DataFrame column will be casted to target column type **on MSSQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [2]_. @@ -48,7 +48,7 @@ Create new table using Spark This is how MSSQL connector performs this: -* Find corresponding ``Spark type`` -> ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in MSSQL, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst index a5de62569..f3fca30a4 100644 --- a/docs/connection/db_connection/mysql/types.rst +++ b/docs/connection/db_connection/mysql/types.rst @@ -14,7 +14,7 @@ Reading from MySQL This is how MySQL connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MySQL type. -* Find corresponding ``MySQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``MySQL type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing MySQL table @@ -25,7 +25,7 @@ This is how MySQL connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get MySQL type. -* Find corresponding ``Spark type`` -> ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
* If ``MySQL type (write)`` match ``MySQL type (read)``, no additional casts will be performed, DataFrame column will be written to MySQL as is. * If ``MySQL type (write)`` does not match ``MySQL type (read)``, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision. @@ -42,7 +42,7 @@ Create new table using Spark This is how MySQL connector performs this: -* Find corresponding ``Spark type`` -> ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in MySQL, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst index 330a460a8..2c6116ce4 100644 --- a/docs/connection/db_connection/oracle/types.rst +++ b/docs/connection/db_connection/oracle/types.rst @@ -14,7 +14,7 @@ Reading from Oracle This is how Oracle connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Oracle type. -* Find corresponding ``Oracle type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Oracle type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing Oracle table @@ -25,8 +25,8 @@ This is how Oracle connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. -* **Find corresponding** ``Oracle type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ -* Find corresponding ``Spark type`` -> ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* **Find corresponding** ``Oracle type (read)`` → ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` → ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Oracle type (write)`` match ``Oracle type (read)``, no additional casts will be performed, DataFrame column will be written to Oracle as is. * If ``Oracle type (write)`` does not match ``Oracle type (read)``, DataFrame column will be casted to target column type **on Oracle side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision. @@ -48,7 +48,7 @@ Create new table using Spark This is how Oracle connector performs this: -* Find corresponding ``Spark type`` -> ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
+* Find corresponding ``Spark type`` → ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Oracle, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index 94351f234..4214fe63d 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -14,7 +14,7 @@ Reading from Postgres This is how Postgres connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Postgres type. -* Find corresponding ``Postgres type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception. +* Find corresponding ``Postgres type (read)`` → ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. .. [1] @@ -28,7 +28,7 @@ This is how Postgres connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive) [2]_. For each found column get Postgres type. -* Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Postgres type (write)`` match ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is. * If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be casted to target column type **on Postgres side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [3]_. @@ -51,7 +51,7 @@ Create new table using Spark This is how Postgres connector performs this: -* Find corresponding ``Spark type`` -> ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Postgres, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. @@ -248,7 +248,7 @@ String types | ``jsonb`` | | | | +-----------------------------+ | | | | ``xml`` | | | | -+-----------------------------+-----------------------| | | ++-----------------------------+-----------------------+ | | | ``CREATE TYPE ... 
AS ENUM`` | ``StringType()`` [1]_ | | | +-----------------------------+ | | | | ``tsvector`` | | | | diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 3892c5f94..1d8a87fbd 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -38,7 +38,7 @@ class Clickhouse(JDBCConnection): Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch4 `_ (`official Clickhouse JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`clickhouse-prerequisites` diff --git a/onetl/connection/db_connection/clickhouse/options.py b/onetl/connection/db_connection/clickhouse/options.py index 54b4558fb..5e35c9693 100644 --- a/onetl/connection/db_connection/clickhouse/options.py +++ b/onetl/connection/db_connection/clickhouse/options.py @@ -14,20 +14,20 @@ class ClickhouseReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class ClickhouseWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class ClickhouseSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class ClickhouseFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class ClickhouseExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 77319b3b0..1814e02a1 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -77,7 +77,7 @@ class Greenplum(JDBCMixin, DBConnection): Based on package ``io.pivotal:greenplum-spark:2.2.0`` (`VMware Greenplum connector for Spark `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`greenplum-prerequisites` diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 65d275af3..cd3b02c41 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -319,12 +319,12 @@ def _mode_is_deprecated(cls, values): class GreenplumSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class GreenplumFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class GreenplumExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 42f6f2a92..7d56df02d 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -39,7 +39,7 @@ class Hive(DBConnection): """Spark connection with Hive MetaStore support. |support_hooks| - .. warning:: + .. 
seealso:: Before using this connector please take into account :ref:`hive-prerequisites` diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 0e5895ac1..08a8fbe65 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -57,7 +57,7 @@ class Kafka(DBConnection): Based on `official Kafka Source For Spark `_. - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`kafka-prerequisites` diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 690bd6d4b..e4fa55b1c 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -53,7 +53,7 @@ class MongoDB(DBConnection): Based on package `org.mongodb.spark:mongo-spark-connector:10.3.0 `_ (`MongoDB connector for Spark `_) - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`mongodb-prerequisites` diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 04add7157..db53835e3 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -32,11 +32,11 @@ class Config: class MSSQL(JDBCConnection): """MSSQL JDBC connection. |support_hooks| - Based on Maven package ``com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8`` + Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8 `_ (`official MSSQL JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`mssql-prerequisites` diff --git a/onetl/connection/db_connection/mssql/options.py b/onetl/connection/db_connection/mssql/options.py index 5e5a9d11a..c14e38b68 100644 --- a/onetl/connection/db_connection/mssql/options.py +++ b/onetl/connection/db_connection/mssql/options.py @@ -13,20 +13,20 @@ class MSSQLReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class MSSQLWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class MSSQLSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class MSSQLFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class MSSQLExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index a80bd7ac9..ff21a3be2 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -37,7 +37,7 @@ class MySQL(JDBCConnection): Based on Maven package `com.mysql:mysql-connector-j:8.4.0 `_ (`official MySQL JDBC driver `_). - .. warning:: + .. 
seealso:: Before using this connector please take into account :ref:`mysql-prerequisites` diff --git a/onetl/connection/db_connection/mysql/options.py b/onetl/connection/db_connection/mysql/options.py index b2bb2b9d3..06abd6d2d 100644 --- a/onetl/connection/db_connection/mysql/options.py +++ b/onetl/connection/db_connection/mysql/options.py @@ -14,20 +14,20 @@ class MySQLReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class MySQLWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class MySQLSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class MySQLFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class MySQLExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 612cfd9e5..f07fa8c95 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -84,7 +84,7 @@ class Oracle(JDBCConnection): Based on Maven package `com.oracle.database.jdbc:ojdbc8:23.4.0.24.05 `_ (`official Oracle JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`oracle-prerequisites` diff --git a/onetl/connection/db_connection/oracle/options.py b/onetl/connection/db_connection/oracle/options.py index 2e0b4f9c0..61b82e1b1 100644 --- a/onetl/connection/db_connection/oracle/options.py +++ b/onetl/connection/db_connection/oracle/options.py @@ -14,20 +14,20 @@ class OracleReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class OracleWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class OracleSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class OracleFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class OracleExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index e78d175a8..3bcac2eb9 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -43,10 +43,10 @@ class Config: class Postgres(JDBCConnection): """PostgreSQL JDBC connection. |support_hooks| - Based on Maven package ``org.postgresql:postgresql:42.7.3`` + Based on Maven package `org.postgresql:postgresql:42.7.3 `_ (`official Postgres JDBC driver `_). - .. warning:: + .. 
seealso:: Before using this connector please take into account :ref:`postgres-prerequisites` diff --git a/onetl/connection/db_connection/postgres/options.py b/onetl/connection/db_connection/postgres/options.py index 4f7aecd1d..3a4dd806f 100644 --- a/onetl/connection/db_connection/postgres/options.py +++ b/onetl/connection/db_connection/postgres/options.py @@ -13,20 +13,20 @@ class PostgresReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class PostgresWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class PostgresSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class PostgresFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class PostgresExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 3f7f884e3..fcffe1772 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -38,10 +38,10 @@ class Config: class Teradata(JDBCConnection): """Teradata JDBC connection. |support_hooks| - Based on package ``com.teradata.jdbc:terajdbc:17.20.00.15`` + Based on package `com.teradata.jdbc:terajdbc:17.20.00.15 `_ (`official Teradata JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`teradata-prerequisites` diff --git a/onetl/connection/db_connection/teradata/options.py b/onetl/connection/db_connection/teradata/options.py index c71592c47..eb77f8c87 100644 --- a/onetl/connection/db_connection/teradata/options.py +++ b/onetl/connection/db_connection/teradata/options.py @@ -13,20 +13,20 @@ class TeradataReadOptions(JDBCReadOptions): - pass + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] class TeradataWriteOptions(JDBCWriteOptions): - pass + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] class TeradataSQLOptions(JDBCSQLOptions): - pass + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] class TeradataFetchOptions(JDBCFetchOptions): - pass + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] class TeradataExecuteOptions(JDBCExecuteOptions): - pass + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 677ffe3bd..604e70984 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -37,7 +37,7 @@ class SparkHDFS(SparkFileDFConnection): Based on `Spark Generic File Data Source `_. - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`spark-hdfs-prerequisites` diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index eaec3574f..b71a9ad12 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -53,7 +53,7 @@ class SparkS3(SparkFileDFConnection): Based on `Hadoop-AWS module `_ and `Spark integration with Cloud Infrastructures `_. - .. warning:: + .. 
seealso:: Before using this connector please take into account :ref:`spark-s3-prerequisites` diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index af038c24f..dff63de7c 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -58,20 +58,8 @@ class DBReader(FrozenModel): .. note:: - This class operates with only one table at a time. It does NOT support executing JOINs. - - To get the JOIN result you can instead: - - 1. Use 2 instandes of DBReader with different tables, - call :obj:`~run` of each one to get a table dataframe, - and then use ``df1.join(df2)`` syntax (Hive) - - 2. Use ``connection.execute("INSERT INTO ... SELECT ... JOIN ...")`` - to execute JOIN on RDBMS side, write the result into a temporary table, - and then use DBReader to get the data from this temporary table (MPP systems, like Greenplum) - - 3. Use ``connection.sql(query)`` method to pass SQL query with a JOIN, - and fetch the result (other RDBMS) + This class operates with only one source at a time. It does NOT support executing queries + to multiple source, like ``SELECT ... JOIN``. Parameters ---------- @@ -617,12 +605,6 @@ def run(self) -> DataFrame: df : pyspark.sql.dataframe.DataFrame Spark dataframe - .. note:: - - Keep in mind that with differences in the timezone settings of the source and Spark, - there may be discrepancies in the datetime on the source and in the Spark dataframe. - It depends on the ``spark.sql.session.timeZone`` option set when creating the Spark session. - Examples -------- diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index eb83c9eb8..81bc070d6 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -42,7 +42,7 @@ class DBWriter(FrozenModel): including the schema, e.g. ``schema.name``. options : dict, :obj:`onetl.connection.DBConnection.WriteOptions`, default: ``None`` - Spark write options. + Spark write options. Can be in form of special ``WriteOptions`` object or a dict. For example: ``{"if_exists": "replace_entire_table", "compression": "snappy"}`` @@ -107,9 +107,7 @@ class DBWriter(FrozenModel): spark=spark, ) - options = {"truncate": "true", "batchsize": 1000} - # or (it is the same): - options = Postgres.WriteOptions(truncate=True, batchsize=1000) + options = Postgres.WriteOptions(if_exists="replace_entire_table", batchsize=1000) writer = DBWriter( connection=postgres, diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 7cfae4408..305603791 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -65,7 +65,6 @@ class Avro(ReadWriteFileFormat): * Spark versions: 2.4.x - 3.5.x * Java versions: 8 - 20 - * Scala versions: 2.11 - 2.13 See documentation from link above. diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index a3ea7a015..ee7bfe74c 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -57,7 +57,7 @@ class Excel(ReadWriteFileFormat): .. dropdown:: Version compatibility - * Spark versions: 3.2.x - 3.5.x. + * Spark versions: 3.2.x - 3.5.x .. warning:: @@ -65,7 +65,6 @@ class Excel(ReadWriteFileFormat): See `Maven index `_ and `official documentation `_. - * Scala versions: 2.12 - 2.13 * Java versions: 8 - 20 See documentation from link above. diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 1c1f954eb..5165adfca 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -88,8 +88,7 @@ class XML(ReadWriteFileFormat): .. 
dropdown:: Version compatibility - * Spark versions: 3.2.x - 3.5.x. - * Scala versions: 2.12 - 2.13 + * Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 See documentation from link above. From 7431bdf668f12b836074a99c6f86e675f18a8f1b Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 24 May 2024 17:41:46 +0300 Subject: [PATCH 64/71] update clickhouse docs --- docs/connection/db_connection/clickhouse/types.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 19f052d53..00a71551a 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -329,7 +329,7 @@ Columns of these Clickhouse types cannot be read by Spark: * ``SimpleAggregateFunction(func, T)`` * ``Tuple(T1, T2, ...)`` -Dataframe with these Spark types be written to Clickhouse: +Dataframe with these Spark types cannot be written to Clickhouse: * ``ArrayType(T)`` * ``BinaryType()`` * ``CharType(N)`` @@ -365,6 +365,7 @@ For parsing JSON columns in ClickHouse, :obj:`JSON.parse_column Date: Fri, 24 May 2024 20:49:58 +0000 Subject: [PATCH 65/71] [DOP-13852] Update MSSQL package to 12.6.2 --- docs/changelog/next_release/254.feature.rst | 2 +- .../db_connection/mssql/connection.py | 10 +++++----- .../test_mssql_unit.py | 20 +++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/changelog/next_release/254.feature.rst b/docs/changelog/next_release/254.feature.rst index 0a8aff089..9b331b51d 100644 --- a/docs/changelog/next_release/254.feature.rst +++ b/docs/changelog/next_release/254.feature.rst @@ -1 +1 @@ -:class:`MSSQL` connection now uses Microsoft SQL Server JDBC driver ``12.6.1``, upgraded from ``12.2.0``, and supports passing custom versions: ``MSSQL.get_packages(java_version=..., package_version=...)``. +:class:`MSSQL` connection now uses Microsoft SQL Server JDBC driver ``12.6.2``, upgraded from ``12.2.0``, and supports passing custom versions: ``MSSQL.get_packages(java_version=..., package_version=...)``. diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index db53835e3..5fd50aa0f 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -32,7 +32,7 @@ class Config: class MSSQL(JDBCConnection): """MSSQL JDBC connection. |support_hooks| - Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8 `_ + Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8 `_ (`official MSSQL JDBC driver `_). @@ -173,7 +173,7 @@ def get_packages( java_version : str, optional Java major version, defaults to ``8``. Must be ``8`` or ``11``. package_version : str, optional - Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.6.1.``. + Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.6.2.``. 
Examples -------- @@ -184,10 +184,10 @@ def get_packages( MSSQL.get_packages() # specify Java and package versions - MSSQL.get_packages(java_version="8", package_version="12.6.1.jre11") + MSSQL.get_packages(java_version="8", package_version="12.6.2.jre11") """ default_java_version = "8" - default_package_version = "12.6.1" + default_package_version = "12.6.2" java_ver = Version(java_version or default_java_version) if java_ver.major < 8: @@ -209,7 +209,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MSSQL.package` will be removed in 1.0.0, use `MSSQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8" + return "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index e1069c20e..3e3f81496 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ -14,23 +14,23 @@ def test_mssql_class_attributes(): def test_mssql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MSSQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8" + assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" @pytest.mark.parametrize( "java_version, package_version, expected_packages", [ - (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), - ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), - ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), - ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), - ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), - ("8", "12.6.1.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), - ("11", "12.6.1.jre11", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("8", "12.6.2.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", "12.6.2.jre11", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), ("11", "12.7.0.jre11-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre11-preview"]), ("8", "12.7.0.jre8-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre8-preview"]), - ("8", "12.6.1", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre8"]), - ("11", "12.6.1", ["com.microsoft.sqlserver:mssql-jdbc:12.6.1.jre11"]), + ("8", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), ], ) def test_mssql_get_packages(java_version, package_version, expected_packages): From f0025a415539fc9565710f8eb72dd0af6acd964e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 27 May 2024 06:50:51 +0000 Subject: [PATCH 66/71] [DOP-13855] Update Clickhouse package to 0.6.0-patch5 --- docs/changelog/next_release/249.breaking.rst | 2 +- 
onetl/connection/db_connection/clickhouse/connection.py | 8 ++++---- .../tests_db_connection_unit/test_clickhouse_unit.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst index ca733c81f..1069a5324 100644 --- a/docs/changelog/next_release/249.breaking.rst +++ b/docs/changelog/next_release/249.breaking.rst @@ -1 +1 @@ -Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0-patch4 `_. +Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0-patch5 `_. diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 1d8a87fbd..3e75d768e 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -35,7 +35,7 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch4 `_ + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch5 `_ (`official Clickhouse JDBC driver `_). .. seealso:: @@ -132,7 +132,7 @@ def get_packages( Parameters ---------- package_version : str, optional - ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch4``. + ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch5``. apache_http_client_version : str, optional Apache HTTP Client version package. Defaults to ``5.3.1``. @@ -152,7 +152,7 @@ def get_packages( ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. """ - default_jdbc_version = "0.6.0-patch4" + default_jdbc_version = "0.6.0-patch5" default_http_version = "5.3.1" jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) @@ -171,7 +171,7 @@ def get_packages( @classproperty def package(self) -> str: """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" - return "com.clickhouse:clickhouse-jdbc:0.6.0-patch4,com.clickhouse:clickhouse-http-client:0.6.0-patch4,org.apache.httpcomponents.client5:httpclient5:5.3.1" + return "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 29478b6c9..4ce55a572 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -11,7 +11,7 @@ def test_clickhouse_driver(): def test_clickhouse_package(): expected_packages = ( - "com.clickhouse:clickhouse-jdbc:0.6.0-patch4,com.clickhouse:clickhouse-http-client:0.6.0-patch4," + "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5," "org.apache.httpcomponents.client5:httpclient5:5.3.1" ) assert Clickhouse.package == expected_packages @@ -24,8 +24,8 @@ def test_clickhouse_package(): None, None, [ - "com.clickhouse:clickhouse-jdbc:0.6.0-patch4", - "com.clickhouse:clickhouse-http-client:0.6.0-patch4", + "com.clickhouse:clickhouse-jdbc:0.6.0-patch5", + "com.clickhouse:clickhouse-http-client:0.6.0-patch5", 
"org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), From 54e87343463c1f7197df55a6c93cb484d6571646 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 27 May 2024 09:00:38 +0000 Subject: [PATCH 67/71] [DOP-16174] Prepare for release --- docs/changelog/0.11.0.rst | 227 ++++++++++++++++++ docs/changelog/0.7.0.rst | 8 +- docs/changelog/index.rst | 1 + docs/changelog/next_release/+.bugfix.rst | 1 - docs/changelog/next_release/249.breaking.rst | 1 - docs/changelog/next_release/249.feature.rst | 1 - docs/changelog/next_release/251.feature.rst | 1 - docs/changelog/next_release/252.feature.rst | 1 - docs/changelog/next_release/253.feature.rst | 1 - docs/changelog/next_release/254.feature.rst | 1 - docs/changelog/next_release/255.feature.rst | 1 - docs/changelog/next_release/256.feature.rst | 1 - docs/changelog/next_release/257.feature.rst | 1 - docs/changelog/next_release/258.feature.rst | 1 - docs/changelog/next_release/259.feature.rst | 1 - docs/changelog/next_release/265.feature.rst | 1 - docs/changelog/next_release/267.breaking.rst | 26 -- docs/changelog/next_release/268.feature.rst | 1 - docs/changelog/next_release/269.feature.rst | 1 - docs/changelog/next_release/272.feature.rst | 47 ---- .../next_release/273.improvement.rst | 1 - docs/changelog/next_release/274.feature.rst | 37 --- .../next_release/275.improvement.rst | 1 - .../next_release/276.improvement.rst | 4 - .../next_release/278.improvement.rst | 4 - .../next_release/279.improvement.rst | 1 - .../next_release/280.improvement.rst | 1 - 27 files changed, 232 insertions(+), 141 deletions(-) create mode 100644 docs/changelog/0.11.0.rst delete mode 100644 docs/changelog/next_release/+.bugfix.rst delete mode 100644 docs/changelog/next_release/249.breaking.rst delete mode 100644 docs/changelog/next_release/249.feature.rst delete mode 100644 docs/changelog/next_release/251.feature.rst delete mode 100644 docs/changelog/next_release/252.feature.rst delete mode 100644 docs/changelog/next_release/253.feature.rst delete mode 100644 docs/changelog/next_release/254.feature.rst delete mode 100644 docs/changelog/next_release/255.feature.rst delete mode 100644 docs/changelog/next_release/256.feature.rst delete mode 100644 docs/changelog/next_release/257.feature.rst delete mode 100644 docs/changelog/next_release/258.feature.rst delete mode 100644 docs/changelog/next_release/259.feature.rst delete mode 100644 docs/changelog/next_release/265.feature.rst delete mode 100644 docs/changelog/next_release/267.breaking.rst delete mode 100644 docs/changelog/next_release/268.feature.rst delete mode 100644 docs/changelog/next_release/269.feature.rst delete mode 100644 docs/changelog/next_release/272.feature.rst delete mode 100644 docs/changelog/next_release/273.improvement.rst delete mode 100644 docs/changelog/next_release/274.feature.rst delete mode 100644 docs/changelog/next_release/275.improvement.rst delete mode 100644 docs/changelog/next_release/276.improvement.rst delete mode 100644 docs/changelog/next_release/278.improvement.rst delete mode 100644 docs/changelog/next_release/279.improvement.rst delete mode 100644 docs/changelog/next_release/280.improvement.rst diff --git a/docs/changelog/0.11.0.rst b/docs/changelog/0.11.0.rst new file mode 100644 index 000000000..e74ba41fa --- /dev/null +++ b/docs/changelog/0.11.0.rst @@ -0,0 +1,227 @@ +0.11.0 (2024-05-27) +=================== + 
+Breaking Changes +---------------- + +There can be some changes in connection behavior related to version upgrades, so we mark these changes as **breaking**, although +most users will not see any differences. + +- Update Clickhouse JDBC driver to the latest version (:github:pull:`249`): + * Package was renamed ``ru.yandex.clickhouse:clickhouse-jdbc`` → ``com.clickhouse:clickhouse-jdbc``. + * Package version changed ``0.3.2`` → ``0.6.0-patch5``. + * Driver name changed ``ru.yandex.clickhouse.ClickHouseDriver`` → ``com.clickhouse.jdbc.ClickHouseDriver``. + + This brings several fixes for Spark <-> Clickhouse type compatibility, as well as Clickhouse cluster support. + +- Update other JDBC drivers to their latest versions: + * MSSQL ``12.2.0`` → ``12.6.2`` (:github:pull:`254`). + * MySQL ``8.0.33`` → ``8.4.0`` (:github:pull:`253`, :github:pull:`285`). + * Oracle ``23.2.0.0`` → ``23.4.0.24.05`` (:github:pull:`252`, :github:pull:`284`). + * Postgres ``42.6.0`` → ``42.7.3`` (:github:pull:`251`). + +- Update MongoDB connector to the latest version: ``10.1.1`` → ``10.3.0`` (:github:pull:`255`, :github:pull:`283`). + + This brings Spark 3.5 support. + +- Update ``XML`` package to the latest version: ``0.17.0`` → ``0.18.0`` (:github:pull:`259`). + + This brings a few bugfixes for datetime format handling. + +- Serialize ``ColumnDatetimeHWM`` to Clickhouse's ``DateTime64(6)`` (precision up to microseconds) instead of ``DateTime`` (precision up to seconds) (:github:pull:`267`). + + In previous onETL versions, the ``ColumnDatetimeHWM`` value was rounded to the second, which could lead to re-reading rows already read in previous runs, + producing duplicates. + + For Clickhouse versions below 21.1, comparing a column of type ``DateTime`` with a value of type ``DateTime64`` is not supported, returning an empty dataframe. + To avoid this, replace: + + .. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="hwm_column", # <-- + ), + ) + + with: + + .. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="CAST(hwm_column AS DateTime64)", # <-- add explicit CAST + ), + ) + +- Pass JDBC connection extra params as ``properties`` dict instead of URL with query part (:github:pull:`268`). + + This allows passing custom connection parameters like ``Clickhouse(extra={"custom_http_options": "option1=value1,option2=value2"})`` + without the need to urlencode the parameter value, like ``option1%3Dvalue1%2Coption2%3Dvalue2``. + +- For JDBC connections, add a new ``SQLOptions`` class for the ``DB.sql(query, options=...)`` method (:github:pull:`272`). + + Firstly, this keeps naming more consistent. + + Secondly, some options are not supported by the ``DB.sql(...)`` method, but are supported by ``DBReader``. + For example, ``SQLOptions`` does not support ``partitioning_mode`` and requires explicit definition of ``lower_bound`` and ``upper_bound`` when ``num_partitions`` is greater than 1. + ``ReadOptions`` does support ``partitioning_mode`` and allows skipping ``lower_bound`` and ``upper_bound`` values. + + This requires some code changes. Before: + + .. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...)
+ df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.SQLOptions( + # partitioning_mode is not supported! + partition_column="id", + num_partitions=10, + lower_bound=0, # <-- set explicitly + upper_bound=1000, # <-- set explicitly + ), + ) + + For now, ``DB.sql(query, options=...)`` can accept ``ReadOptions`` to keep backward compatibility, but emits deprecation warning. + The support will be removed in ``v1.0.0``. + +- Split up ``JDBCOptions`` class into ``FetchOptions`` and ``ExecuteOptions`` (:github:pull:`274`). + + New classes are used by ``DB.fetch(query, options=...)`` and ``DB.execute(query, options=...)`` methods respectively. + This is mostly to keep naming more consistent. + + This require some code changes. Before: + + .. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.JDBCOptions( + fetchsize=1000, + query_timeout=30, + ), + ) + + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.JDBCOptions(query_timeout=30), + ) + + After: + + .. code-block:: python + + from onetl.connection import Postgres + + # Using FetchOptions for fetching data + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.FetchOptions( # <-- change class name + fetchsize=1000, + query_timeout=30, + ), + ) + + # Using ExecuteOptions for executing statements + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.ExecuteOptions(query_timeout=30), # <-- change class name + ) + + For now, ``DB.fetch(query, options=...)`` and ``DB.execute(query, options=...)`` can accept ``JDBCOptions``, to keep backward compatibility, + but emit a deprecation warning. The old class will be removed in ``v1.0.0``. + +Features +-------- + +Improve user experience with Kafka messages and Database tables with serialized columns, like JSON/XML. + +- Allow passing custom package version as argument for ``DB.get_packages(...)`` method of several DB connectors: + * ``Clickhouse.get_packages(package_version=..., apache_http_client_version=...)`` (:github:pull:`249`). + * ``MongoDB.get_packages(scala_version=..., spark_version=..., package_version=...)`` (:github:pull:`255`). + * ``MySQL.get_packages(package_version=...)`` (:github:pull:`253`). + * ``MSSQL.get_packages(java_version=..., package_version=...)`` (:github:pull:`254`). + * ``Oracle.get_packages(java_version=..., package_version=...)`` (:github:pull:`252`). + * ``Postgres.get_packages(package_version=...)`` (:github:pull:`251`). + * ``Teradata.get_packages(package_version=...)`` (:github:pull:`256`). + + Now users can downgrade or upgrade connection without waiting for next onETL release. Previously only ``Kafka`` and ``Greenplum`` supported this feature. + +- Add ``FileFormat.parse_column(...)`` method to several classes: + * ``Avro.parse_column(col)`` (:github:pull:`265`). + * ``JSON.parse_column(col, schema=...)`` (:github:pull:`257`). + * ``CSV.parse_column(col, schema=...)`` (:github:pull:`258`). + * ``XML.parse_column(col, schema=...)`` (:github:pull:`269`). + + This allows parsing data in ``value`` field of Kafka message or string/binary column of some table as a nested Spark structure. 
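+
+  For example, a minimal sketch of the assumed usage with ``JSON.parse_column`` (the DataFrame, column and schema names here
+  are illustrative, and the exact ``parse_column`` signature should be checked against the file format documentation):
+
+  .. code-block:: python
+
+      from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+      from onetl.file.format import JSON
+
+      # expected structure of the JSON payload stored in the Kafka "value" field
+      schema = StructType(
+          [
+              StructField("id", IntegerType()),
+              StructField("name", StringType()),
+          ]
+      )
+
+      # kafka_df is assumed to be a DataFrame previously read from Kafka,
+      # with the raw JSON string in its "value" column
+      parsed_df = kafka_df.select(JSON().parse_column("value", schema))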
+ +- Add ``FileFormat.serialize_column(...)`` method to several classes: + * ``Avro.serialize_column(col)`` (:github:pull:`265`). + * ``JSON.serialize_column(col)`` (:github:pull:`257`). + * ``CSV.serialize_column(col)`` (:github:pull:`258`). + + This allows saving Spark nested structures or arrays to ``value`` field of Kafka message or string/binary column of some table. + +Improvements +------------ + +Few documentation improvements. + +- Replace all ``assert`` in documentation with doctest syntax. This should make documentation more readable (:github:pull:`273`). + +- Add generic ``Troubleshooting`` guide (:github:pull:`275`). + +- Improve Kafka documentation: + * Add "Prerequisites" page describing different aspects of connecting to Kafka. + * Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes. + * Add "Troubleshooting" page (:github:pull:`276`). + +- Improve Hive documentation: + * Add "Prerequisites" page describing different aspects of connecting to Hive. + * Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations. + * Improve "Executing statements in Hive" page of Hive documentation. (:github:pull:`278`). + +- Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. (:github:pull:`279`). + +- Add note about connecting to Clickhouse cluster. (:github:pull:`280`). + + +Bug Fixes +--------- + +- Fix missing ``pysmb`` package after installing ``pip install onetl[files]`` . diff --git a/docs/changelog/0.7.0.rst b/docs/changelog/0.7.0.rst index ac2928195..385f736f7 100644 --- a/docs/changelog/0.7.0.rst +++ b/docs/changelog/0.7.0.rst @@ -114,13 +114,13 @@ Breaking Changes * Greenplum ``2.1.3`` → ``2.1.4``. * MSSQL ``10.2.1.jre8`` → ``12.2.0.jre8``. Minimal supported version of MSSQL is now 2014 instead 2021. * MySQL ``8.0.30`` → ``8.0.33``: - * * Package was renamed ``mysql:mysql-connector-java`` → ``com.mysql:mysql-connector-j``. - * * Driver class was renamed ``com.mysql.jdbc.Driver`` → ``com.mysql.cj.jdbc.Driver``. + * Package was renamed ``mysql:mysql-connector-java`` → ``com.mysql:mysql-connector-j``. + * Driver class was renamed ``com.mysql.jdbc.Driver`` → ``com.mysql.cj.jdbc.Driver``. * Oracle ``21.6.0.0.1`` → ``23.2.0.0``. * Postgres ``42.4.0`` → ``42.6.0``. * Teradata ``17.20.00.08`` → ``17.20.00.15``: - * * Package was renamed ``com.teradata.jdbc:terajdbc4`` → ``com.teradata.jdbc:terajdbc``. - * * Teradata driver is now published to Maven. + * Package was renamed ``com.teradata.jdbc:terajdbc4`` → ``com.teradata.jdbc:terajdbc``. + * Teradata driver is now published to Maven. See :github:pull:`31`. diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 715b7fdf6..62c93671d 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 0.11.0 0.10.2 0.10.1 0.10.0 diff --git a/docs/changelog/next_release/+.bugfix.rst b/docs/changelog/next_release/+.bugfix.rst deleted file mode 100644 index 0b9a0db76..000000000 --- a/docs/changelog/next_release/+.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix missing ``pysmb`` package after installing ``pip install onetl[files]`` . 
diff --git a/docs/changelog/next_release/249.breaking.rst b/docs/changelog/next_release/249.breaking.rst deleted file mode 100644 index 1069a5324..000000000 --- a/docs/changelog/next_release/249.breaking.rst +++ /dev/null @@ -1 +0,0 @@ -Updated the Clickhouse JDBC driver from ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` to `com.clickhouse:clickhouse-jdbc:0.6.0-patch5 `_. diff --git a/docs/changelog/next_release/249.feature.rst b/docs/changelog/next_release/249.feature.rst deleted file mode 100644 index 8ec0686c2..000000000 --- a/docs/changelog/next_release/249.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`Clickhouse` connection supports passing custom versions: ``Clickhouse.get_packages(package_version=...)``. diff --git a/docs/changelog/next_release/251.feature.rst b/docs/changelog/next_release/251.feature.rst deleted file mode 100644 index bc8d528a2..000000000 --- a/docs/changelog/next_release/251.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`Postgres` connection now uses PostgreSQL JDBC driver ``42.7.3``, upgraded from ``42.6.0``, and supports passing custom versions: ``Postgres.get_packages(package_version=...)``. diff --git a/docs/changelog/next_release/252.feature.rst b/docs/changelog/next_release/252.feature.rst deleted file mode 100644 index db7c72960..000000000 --- a/docs/changelog/next_release/252.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`Oracle` connection now uses Oracle JDBC driver ``23.4.0.24.05``, upgraded from ``23.2.0.0``, and supports passing custom versions: ``Oracle.get_packages(java_version=..., package_version=...)``. diff --git a/docs/changelog/next_release/253.feature.rst b/docs/changelog/next_release/253.feature.rst deleted file mode 100644 index 92994bdbb..000000000 --- a/docs/changelog/next_release/253.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`MySQL` connection now uses MySQL JDBC driver ``8.4.0``, upgraded from ``8.0.33``, and supports passing custom versions: ``MySQL.get_packages(package_version=...)``. diff --git a/docs/changelog/next_release/254.feature.rst b/docs/changelog/next_release/254.feature.rst deleted file mode 100644 index 9b331b51d..000000000 --- a/docs/changelog/next_release/254.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`MSSQL` connection now uses Microsoft SQL Server JDBC driver ``12.6.2``, upgraded from ``12.2.0``, and supports passing custom versions: ``MSSQL.get_packages(java_version=..., package_version=...)``. diff --git a/docs/changelog/next_release/255.feature.rst b/docs/changelog/next_release/255.feature.rst deleted file mode 100644 index 4eae087fa..000000000 --- a/docs/changelog/next_release/255.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`MongoDB` connection now uses MongoDB Spark connector ``10.3.0``, upgraded from ``10.1.1``, and supports passing custom versions: ``MongoDB.get_packages(scala_version=..., package_version=...)``. diff --git a/docs/changelog/next_release/256.feature.rst b/docs/changelog/next_release/256.feature.rst deleted file mode 100644 index 752d7ffb1..000000000 --- a/docs/changelog/next_release/256.feature.rst +++ /dev/null @@ -1 +0,0 @@ -:class:`Teradata` connection now supports passing custom versions: ``Teradata.get_packages(package_version=...)``. 
diff --git a/docs/changelog/next_release/257.feature.rst b/docs/changelog/next_release/257.feature.rst deleted file mode 100644 index e72de4596..000000000 --- a/docs/changelog/next_release/257.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``JSON.parse_column`` and ``JSON.serialize_column`` methods to facilitate direct parsing of JSON strings into Spark DataFrame columns and serialization of structured DataFrame columns back into JSON strings. diff --git a/docs/changelog/next_release/258.feature.rst b/docs/changelog/next_release/258.feature.rst deleted file mode 100644 index e962e3156..000000000 --- a/docs/changelog/next_release/258.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``CSV.parse_column`` and ``CSV.serialize_column`` methods to facilitate direct parsing of CSV strings into Spark DataFrame CSV columns and serialization of structured DataFrame CSV columns back into CSV strings. diff --git a/docs/changelog/next_release/259.feature.rst b/docs/changelog/next_release/259.feature.rst deleted file mode 100644 index 8934a32ec..000000000 --- a/docs/changelog/next_release/259.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Update ``XML`` package from 0.17.0 to 0.18.0. diff --git a/docs/changelog/next_release/265.feature.rst b/docs/changelog/next_release/265.feature.rst deleted file mode 100644 index 03c39a942..000000000 --- a/docs/changelog/next_release/265.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``Avro.parse_column`` and ``Avro.serialize_column`` methods to enhance the handling of Avro binary data within Spark. These methods allow for direct parsing of binary Avro data into structured Spark DataFrame columns and serialization of Spark DataFrame columns back into Avro binary format. diff --git a/docs/changelog/next_release/267.breaking.rst b/docs/changelog/next_release/267.breaking.rst deleted file mode 100644 index 5ce301393..000000000 --- a/docs/changelog/next_release/267.breaking.rst +++ /dev/null @@ -1,26 +0,0 @@ -Serialize DateTimeHWM to Clickhouse's ``DateTime64(6)`` (precision up to microseconds) instead of ``DateTime`` (precision up to seconds). - -For Clickhouse below 21.1 comparing column of type ``DateTime`` with a value of type ``DateTime64`` was not supported, returning an empty dataframe. -To avoid this, replace: - -.. code:: python - - DBReader( - ..., - hwm=DBReader.AutoDetectHWM( - name="my_hwm", - expression="hwm_column", # <-- - ), - ) - -with: - -.. code:: python - - DBReader( - ..., - hwm=DBReader.AutoDetectHWM( - name="my_hwm", - expression="CAST(hwm_column AS DateTime64)", # <-- - ), - ) diff --git a/docs/changelog/next_release/268.feature.rst b/docs/changelog/next_release/268.feature.rst deleted file mode 100644 index 0938462ed..000000000 --- a/docs/changelog/next_release/268.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Allow passing JDBC connection extra params without urlencode. diff --git a/docs/changelog/next_release/269.feature.rst b/docs/changelog/next_release/269.feature.rst deleted file mode 100644 index 53bb70363..000000000 --- a/docs/changelog/next_release/269.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add ``XML.parse_column`` method for handling XML data within Spark. This method allows for direct parsing of XML strings into structured Spark DataFrame columns. 
diff --git a/docs/changelog/next_release/272.feature.rst b/docs/changelog/next_release/272.feature.rst deleted file mode 100644 index eb33bfd2c..000000000 --- a/docs/changelog/next_release/272.feature.rst +++ /dev/null @@ -1,47 +0,0 @@ -``ReadOptions`` and ``SQLOptions`` have been separated for JDBC connections. ``SQLOptions`` are recommended for the ``.sql`` method in JDBC connections. -``SQLOptions`` do not support ``partitioning_mode`` and require explicit definition of ``lower_bound`` and ``upper_bound`` when ``num_partitions`` is greater than 1. -``ReadOptions`` allow the inclusion of ``partitioning_mode`` and automatically handle ``lower_bound`` and ``upper_bound`` based on the data distribution - - -Before: - -.. code-block:: python - - from onetl.connection import Postgres - - postgres = Postgres(...) - df = postgres.sql( - """ - SELECT * - FROM some.mytable - WHERE key = 'something' - """, - options=Postgres.ReadOptions( - partitioning_mode="range", - partition_column="id", - num_partitions=10, - ), - ) - -After: - -.. code-block:: python - - from onetl.connection import Postgres - - postgres = Postgres(...) - df = postgres.sql( - """ - SELECT * - FROM some.mytable - WHERE key = 'something' - """, - options=Postgres.SQLOptions( - # partitioning_mode is not supported! - partition_column="id", - num_partitions=10, - # this should be set explicitly! - lower_bound=0, - upper_bound=1000, - ), - ) diff --git a/docs/changelog/next_release/273.improvement.rst b/docs/changelog/next_release/273.improvement.rst deleted file mode 100644 index 1e9650b4c..000000000 --- a/docs/changelog/next_release/273.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Replace all ``assert`` in documentation with doctest syntax. This should make documentation more readable. diff --git a/docs/changelog/next_release/274.feature.rst b/docs/changelog/next_release/274.feature.rst deleted file mode 100644 index c622e1392..000000000 --- a/docs/changelog/next_release/274.feature.rst +++ /dev/null @@ -1,37 +0,0 @@ -Divide general ``JDBCOptions`` into ``FetchOptions`` for fetching data and ``ExecuteOptions`` for executing statements. - -Before: - -.. code-block:: python - - from onetl.connection import Postgres - - postgres = Postgres(...) - df = postgres.fetch( - "SELECT * FROM some.mytable WHERE key = 'something'", - options=Postgres.JDBCOptions(fetchsize=1000, query_timeout=30), - ) - - postgres.execute( - "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", - options=Postgres.JDBCOptions(query_timeout=30), - ) - -After: - -.. code-block:: python - - from onetl.connection import Postgres - - # Using FetchOptions for fetching data - postgres = Postgres(...) - df = postgres.fetch( - "SELECT * FROM some.mytable WHERE key = 'something'", - options=Postgres.FetchOptions(fetchsize=1000), - ) - - # Using ExecuteOptions for executing statements - postgres.execute( - "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", - options=Postgres.ExecuteOptions(query_timeout=30), - ) diff --git a/docs/changelog/next_release/275.improvement.rst b/docs/changelog/next_release/275.improvement.rst deleted file mode 100644 index 5985618a0..000000000 --- a/docs/changelog/next_release/275.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Add generic ``Troubleshooting`` guide. 
diff --git a/docs/changelog/next_release/276.improvement.rst b/docs/changelog/next_release/276.improvement.rst deleted file mode 100644 index 4e387bd5a..000000000 --- a/docs/changelog/next_release/276.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Improve Kafka documentation: - * Add "Prerequisites" page describing different aspects of connecting to Kafka - * Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes. - * Add "Troubleshooting" page diff --git a/docs/changelog/next_release/278.improvement.rst b/docs/changelog/next_release/278.improvement.rst deleted file mode 100644 index bbb362142..000000000 --- a/docs/changelog/next_release/278.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Improve Hive documentation: - * Add "Prerequisites" page describing different aspects of connecting to Hive - * Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations. - * Improve "Executing statements in Hive" page of Hive documentation. diff --git a/docs/changelog/next_release/279.improvement.rst b/docs/changelog/next_release/279.improvement.rst deleted file mode 100644 index 02653343e..000000000 --- a/docs/changelog/next_release/279.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. diff --git a/docs/changelog/next_release/280.improvement.rst b/docs/changelog/next_release/280.improvement.rst deleted file mode 100644 index 55432ef45..000000000 --- a/docs/changelog/next_release/280.improvement.rst +++ /dev/null @@ -1 +0,0 @@ -Add note about connecting to Clickhouse cluster. From 78577ed0460b96d80b484d82a8d4c7fe66db3da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 23 May 2024 15:48:01 +0000 Subject: [PATCH 68/71] [DOP-16175] Add versionadded and vesionchanged directives --- docs/hooks/index.rst | 2 + docs/plugins.rst | 4 ++ onetl/base/base_db_connection.py | 6 ++ onetl/base/base_file_connection.py | 42 ++++++++++++- onetl/base/base_file_df_connection.py | 29 +++++++-- onetl/base/base_file_filter.py | 4 ++ onetl/base/base_file_format.py | 12 ++++ onetl/base/base_file_limit.py | 8 +++ onetl/base/supports_rename_dir.py | 4 +- .../db_connection/clickhouse/connection.py | 17 ++--- .../db_connection/greenplum/connection.py | 6 ++ .../db_connection/greenplum/options.py | 2 + .../db_connection/hive/connection.py | 10 +++ onetl/connection/db_connection/hive/slots.py | 11 +++- .../jdbc_connection/connection.py | 2 + .../db_connection/jdbc_connection/options.py | 21 +++++++ .../db_connection/jdbc_mixin/connection.py | 4 ++ .../db_connection/jdbc_mixin/options.py | 9 +++ .../db_connection/kafka/connection.py | 2 + .../db_connection/kafka/kafka_auth.py | 2 + .../db_connection/kafka/kafka_basic_auth.py | 2 + .../kafka/kafka_kerberos_auth.py | 2 + .../kafka/kafka_plaintext_protocol.py | 2 + .../db_connection/kafka/kafka_protocol.py | 2 + .../db_connection/kafka/kafka_scram_auth.py | 2 + .../db_connection/kafka/kafka_ssl_protocol.py | 2 + .../connection/db_connection/kafka/options.py | 4 ++ onetl/connection/db_connection/kafka/slots.py | 14 ++++- .../db_connection/mongodb/connection.py | 10 ++- .../db_connection/mongodb/options.py | 12 +++- .../db_connection/mssql/connection.py | 2 + .../db_connection/mysql/connection.py | 6 ++ 
.../db_connection/oracle/connection.py | 2 + .../db_connection/postgres/connection.py | 4 ++ .../db_connection/teradata/connection.py | 6 ++ onetl/connection/file_connection/ftp.py | 2 + onetl/connection/file_connection/ftps.py | 2 + .../file_connection/hdfs/connection.py | 4 ++ .../connection/file_connection/hdfs/slots.py | 19 +++++- .../mixins/rename_dir_mixin.py | 2 + onetl/connection/file_connection/s3.py | 5 ++ onetl/connection/file_connection/sftp.py | 2 + onetl/connection/file_connection/webdav.py | 2 + .../spark_hdfs/connection.py | 4 ++ .../file_df_connection/spark_hdfs/slots.py | 19 +++++- .../file_df_connection/spark_local_fs.py | 2 + .../file_df_connection/spark_s3/connection.py | 4 ++ onetl/db/db_reader/db_reader.py | 24 +++++-- onetl/db/db_writer/db_writer.py | 10 +++ onetl/exception.py | 63 ++++++++++++++----- onetl/file/file_df_reader/file_df_reader.py | 4 ++ onetl/file/file_df_reader/options.py | 2 + onetl/file/file_downloader/file_downloader.py | 33 +++++++++- onetl/file/file_downloader/options.py | 15 ++++- onetl/file/file_downloader/result.py | 10 +-- onetl/file/file_mover/file_mover.py | 8 ++- onetl/file/file_mover/options.py | 12 +++- onetl/file/file_mover/result.py | 10 +-- onetl/file/file_uploader/file_uploader.py | 16 ++++- onetl/file/file_uploader/options.py | 15 ++++- onetl/file/file_uploader/result.py | 10 +-- onetl/file/filter/exclude_dir.py | 3 + onetl/file/filter/glob.py | 3 + onetl/file/filter/match_all_filters.py | 2 + onetl/file/filter/regexp.py | 3 + onetl/file/format/avro.py | 6 ++ onetl/file/format/csv.py | 6 ++ onetl/file/format/excel.py | 2 + onetl/file/format/json.py | 6 ++ onetl/file/format/jsonline.py | 2 + onetl/file/format/orc.py | 2 + onetl/file/format/parquet.py | 2 + onetl/file/format/xml.py | 4 ++ onetl/file/limit/limits_reached.py | 2 + onetl/file/limit/limits_stop_at.py | 2 + onetl/file/limit/max_files_count.py | 3 + onetl/hooks/hook.py | 12 ++++ onetl/hooks/hook_collection.py | 18 ++++++ onetl/hooks/hooks_state.py | 6 ++ onetl/hooks/slot.py | 11 +++- onetl/hooks/support_hooks.py | 8 +++ onetl/log.py | 13 ++++ onetl/strategy/incremental_strategy.py | 4 ++ onetl/strategy/snapshot_strategy.py | 4 ++ 84 files changed, 632 insertions(+), 60 deletions(-) diff --git a/docs/hooks/index.rst b/docs/hooks/index.rst index da56c10f8..261bb4051 100644 --- a/docs/hooks/index.rst +++ b/docs/hooks/index.rst @@ -3,6 +3,8 @@ Hooks ===== +.. versionadded:: 0.6.0 + .. toctree:: :maxdepth: 1 :caption: Hooks diff --git a/docs/plugins.rst b/docs/plugins.rst index 4c10bc762..7a429d484 100644 --- a/docs/plugins.rst +++ b/docs/plugins.rst @@ -3,6 +3,8 @@ Plugins ======= +.. versionadded:: 0.6.0 + What are plugins? ----------------- @@ -82,6 +84,8 @@ like :ref:`hook-decorator`, it will be executed during this import. How to enable/disable plugins? ------------------------------ +.. versionadded:: 0.7.0 + Disable/enable all plugins ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index 670b15ec5..f9c7bcac0 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -132,6 +132,9 @@ def read_source_as_df( ) -> DataFrame: """ Reads the source to dataframe. |support_hooks| + + .. versionchanged:: 0.9.0 + Renamed ``read_df`` → ``read_source_as_df`` """ @abstractmethod @@ -142,4 +145,7 @@ def write_df_to_target( ) -> None: """ Saves dataframe to a specific target. |support_hooks| + + .. 
versionchanged:: 0.9.0 + Renamed ``write_df`` → ``write_df_to_target`` """ diff --git a/onetl/base/base_file_connection.py b/onetl/base/base_file_connection.py index 823386d55..81d57bfb6 100644 --- a/onetl/base/base_file_connection.py +++ b/onetl/base/base_file_connection.py @@ -15,13 +15,17 @@ class BaseFileConnection(BaseConnection): """ - Implements generic methods for files and directories manipulation on some filesystem (usually remote) + Implements generic methods for files and directories manipulation on some filesystem (usually remote). + + .. versionadded:: 0.8.0 """ @abstractmethod def path_exists(self, path: os.PathLike | str) -> bool: """ - Check if specified path exists on remote filesystem. |support_hooks| + Check if specified path exists on remote filesystem. |support_hooks|. + + .. versionadded:: 0.8.0 Parameters ---------- @@ -48,6 +52,8 @@ def is_file(self, path: os.PathLike | str) -> bool: """ Check if specified path is a file. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -76,6 +82,8 @@ def is_dir(self, path: os.PathLike | str) -> bool: """ Check if specified path is a directory. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -104,6 +112,8 @@ def get_stat(self, path: os.PathLike | str) -> PathStatProtocol: """ Returns stats for a specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -132,6 +142,8 @@ def resolve_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Returns directory at specific path, with stats. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -164,6 +176,8 @@ def resolve_file(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Returns file at specific path, with stats. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -196,6 +210,8 @@ def create_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Creates directory tree on remote filesystem. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -229,6 +245,8 @@ def remove_file(self, path: os.PathLike | str) -> bool: Supports only one file removal per call. Directory removal is **NOT** supported, use :obj:`~remove_dir` instead. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -261,6 +279,8 @@ def remove_dir(self, path: os.PathLike | str, recursive: bool = False) -> bool: If directory does not exist, no exception is raised. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -305,6 +325,8 @@ def rename_file( Supports only one file move per call. Directory move/rename is **NOT** supported. + .. versionadded:: 0.8.0 + Parameters ---------- source_file_path : str or :obj:`os.PathLike` @@ -353,6 +375,8 @@ def list_dir( """ Return list of child files/directories in a specific directory. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -400,6 +424,8 @@ def walk( Just like :obj:`os.walk`, but with additional filter/limit logic. + .. versionadded:: 0.8.0 + Parameters ---------- root : str or :obj:`os.PathLike` @@ -458,6 +484,8 @@ def download_file( Supports only one file download per call. Directory download is **NOT** supported, use :ref:`file-downloader` instead. + .. 
versionadded:: 0.8.0 + Parameters ---------- remote_file_path : str or :obj:`os.PathLike` @@ -518,6 +546,8 @@ def upload_file( Supports only one file upload per call. Directory upload is **NOT** supported, use :ref:`file-uploader` instead. + .. versionadded:: 0.8.0 + Parameters ---------- local_file_path : str or :obj:`os.PathLike` @@ -569,6 +599,8 @@ def read_text(self, path: os.PathLike | str, encoding: str = "utf-8") -> str: r""" Returns string content of a file at specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -601,6 +633,8 @@ def read_bytes(self, path: os.PathLike | str) -> bytes: """ Returns binary content of a file at specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -639,6 +673,8 @@ def write_text( If file already exists, its content will be replaced. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -681,6 +717,8 @@ def write_bytes(self, path: os.PathLike | str, content: bytes) -> PathWithStatsP If file already exists, its content will be replaced. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` diff --git a/onetl/base/base_file_df_connection.py b/onetl/base/base_file_df_connection.py index c432bd5a2..c54390ce8 100644 --- a/onetl/base/base_file_df_connection.py +++ b/onetl/base/base_file_df_connection.py @@ -17,7 +17,9 @@ class FileDFReadOptions(ABC): """ - Protocol for objects supporting altering Spark DataFrameReader options + Protocol for objects supporting altering Spark DataFrameReader options. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -25,6 +27,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameReader`. + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameReader` @@ -38,7 +42,9 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM class FileDFWriteOptions(ABC): """ - Protocol for objects supporting altering Spark DataFrameWriter options + Protocol for objects supporting altering Spark DataFrameWriter options. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -46,6 +52,8 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameWriter`. + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameWriter` @@ -59,7 +67,9 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM class BaseFileDFConnection(BaseConnection): """ - Implements generic methods for reading and writing dataframe as files + Implements generic methods for reading and writing dataframe as files. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -70,6 +80,8 @@ def check_if_format_supported( """ Validate if specific file format is supported. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -80,12 +92,17 @@ def check_if_format_supported( def path_from_string(self, path: os.PathLike | str) -> PurePathProtocol: """ Convert path from string to object. |support_hooks| + + .. versionadded:: 0.9.0 """ @property @abstractmethod def instance_url(self) -> str: - """Instance URL""" + """Instance URL. + + .. versionadded:: 0.9.0 + """ @abstractmethod def read_files_as_df( @@ -98,6 +115,8 @@ def read_files_as_df( ) -> DataFrame: """ Read files in some paths list as dataframe. |support_hooks| + + .. 
versionadded:: 0.9.0 """ @abstractmethod @@ -110,4 +129,6 @@ def write_df_as_files( ) -> None: """ Write dataframe as files in some path. |support_hooks| + + .. versionadded:: 0.9.0 """ diff --git a/onetl/base/base_file_filter.py b/onetl/base/base_file_filter.py index 5fdad2a9e..01a9893f1 100644 --- a/onetl/base/base_file_filter.py +++ b/onetl/base/base_file_filter.py @@ -15,6 +15,8 @@ class BaseFileFilter(ABC): to determine if a file should be handled or not. All filters are stateless. + + .. versionadded:: 0.8.0 """ @abstractmethod @@ -22,6 +24,8 @@ def match(self, path: PathProtocol) -> bool: """ Returns ``True`` if path is matching the filter, ``False`` otherwise + .. versionadded:: 0.8.0 + Examples -------- diff --git a/onetl/base/base_file_format.py b/onetl/base/base_file_format.py index b1f32aa07..a4c72e3e5 100644 --- a/onetl/base/base_file_format.py +++ b/onetl/base/base_file_format.py @@ -12,6 +12,8 @@ class BaseReadableFileFormat(ABC): """ Representation of readable file format. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -19,6 +21,8 @@ def check_if_supported(self, spark: SparkSession) -> None: """ Check if Spark session does support this file format. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -30,6 +34,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameReader`. |support_hooks| + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameReader` @@ -44,6 +50,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM class BaseWritableFileFormat(ABC): """ Representation of writable file format. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -51,6 +59,8 @@ def check_if_supported(self, spark: SparkSession) -> None: """ Check if Spark session does support this file format. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -62,6 +72,8 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameWriter`. |support_hooks| + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameWriter` diff --git a/onetl/base/base_file_limit.py b/onetl/base/base_file_limit.py index 4793f0874..d930690d6 100644 --- a/onetl/base/base_file_limit.py +++ b/onetl/base/base_file_limit.py @@ -17,6 +17,8 @@ class BaseFileLimit(ABC): to determine if internal loop should be stopped. Unlike file filters, limits have internal state which can be updated or reset. + + .. versionadded:: 0.8.0 """ @abstractmethod @@ -24,6 +26,8 @@ def reset(self) -> Self: """ Resets the internal limit state. + .. versionadded:: 0.8.0 + Returns ------- Returns a filter of the same type, but with non-reached state. @@ -45,6 +49,8 @@ def stops_at(self, path: PathProtocol) -> bool: """ Update internal state and return current state. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` @@ -77,6 +83,8 @@ def is_reached(self) -> bool: """ Check if limit is reached. + .. versionadded:: 0.8.0 + Returns ------- ``True`` if limit is reached, ``False`` otherwise. 
diff --git a/onetl/base/supports_rename_dir.py b/onetl/base/supports_rename_dir.py index 703866992..8f0d39712 100644 --- a/onetl/base/supports_rename_dir.py +++ b/onetl/base/supports_rename_dir.py @@ -12,7 +12,9 @@ @runtime_checkable class SupportsRenameDir(Protocol): """ - Protocol for objects containing ``rename_dir`` method + Protocol for objects containing ``rename_dir`` method. + + .. versionadded:: 0.8.0 """ def rename_dir( diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 3e75d768e..0097f2862 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -42,6 +42,8 @@ class Clickhouse(JDBCConnection): Before using this connector please take into account :ref:`clickhouse-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -129,13 +131,19 @@ def get_packages( """ Get package names to be downloaded by Spark. Allows specifying custom JDBC and Apache HTTP Client versions. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- package_version : str, optional - ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch5``. + ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch5``. + + .. versionadded:: 0.11.0 apache_http_client_version : str, optional - Apache HTTP Client version package. Defaults to ``5.3.1``. + Apache HTTP Client version package. Defaults to ``5.3.1``. + + .. versionadded:: 0.11.0 Examples -------- @@ -146,11 +154,6 @@ def get_packages( Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") - .. note:: - - Spark does not support ``.jar`` classifiers, so it is not possible to pass - ``com.clickhouse:clickhouse-jdbc:0.6.0:all`` to install all required packages. - """ default_jdbc_version = "0.6.0-patch5" default_http_version = "5.3.1" diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 1814e02a1..be62afa5f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -81,6 +81,8 @@ class Greenplum(JDBCMixin, DBConnection): Before using this connector please take into account :ref:`greenplum-prerequisites` + .. versionadded:: 0.5.0 + Parameters ---------- host : str @@ -186,6 +188,8 @@ def get_packages( You should pass either ``scala_version`` or ``spark_version``. + .. versionadded:: 0.9.0 + Parameters ---------- scala_version : str, optional @@ -201,6 +205,8 @@ def get_packages( package_version : str, optional, default ``2.2.0`` Package version in format ``major.minor.patch`` + .. versionadded:: 0.10.1 + Examples -------- diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index cd3b02c41..e1cd1902a 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -304,6 +304,8 @@ class Config: * Table exists An error is raised, and no data is written to the table. + .. 
versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ @root_validator(pre=True) diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 7d56df02d..fbedebefa 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -43,11 +43,15 @@ class Hive(DBConnection): Before using this connector please take into account :ref:`hive-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- cluster : str Cluster name. Used for HWM and lineage. + .. versionadded:: 0.7.0 + spark : :obj:`pyspark.sql.SparkSession` Spark session with Hive metastore support enabled @@ -117,6 +121,8 @@ def get_current(cls, spark: SparkSession): Can be used only if there are some hooks bound to :obj:`Slots.get_current_cluster ` slot. + .. versionadded:: 0.7.0 + Parameters ---------- spark : :obj:`pyspark.sql.SparkSession` @@ -183,6 +189,8 @@ def sql( Same as ``spark.sql(query)``. + .. versionadded:: 0.2.0 + Parameters ---------- query : str @@ -213,6 +221,8 @@ def execute( """ Execute DDL or DML statement. |support_hooks| + .. versionadded:: 0.2.0 + Parameters ---------- statement : str diff --git a/onetl/connection/db_connection/hive/slots.py b/onetl/connection/db_connection/hive/slots.py index ce4ceea97..3044950fd 100644 --- a/onetl/connection/db_connection/hive/slots.py +++ b/onetl/connection/db_connection/hive/slots.py @@ -7,7 +7,10 @@ @support_hooks class HiveSlots: - """:ref:`Slots ` that could be implemented by third-party plugins.""" + """:ref:`Slots ` that could be implemented by third-party plugins. + + .. versionadded:: 0.7.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -53,6 +58,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into Hive constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- set[str] | None @@ -84,6 +91,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~check` method to verify that connection is created only from the same cluster. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- str | None diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3c66a20d7..586b63924 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -71,6 +71,8 @@ def sql( Same as ``spark.read.jdbc(query)``. + .. versionadded:: 0.2.0 + Parameters ---------- query : str diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 433eaa9fe..12ac24193 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -115,6 +115,9 @@ class JDBCReadOptions(JDBCOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.5.0 + Replace ``Connection.Options`` → ``Connection.ReadOptions`` + Examples -------- @@ -196,6 +199,9 @@ class Config: default ``fetchsize=10``, which is absolutely not usable. 
Thus we've overridden default value with ``100_000``, which should increase reading performance. + + .. versionchanged:: 0.2.0 + Set explicit default value to ``100_000`` """ partitioning_mode: JDBCPartitioningMode = JDBCPartitioningMode.RANGE @@ -306,6 +312,8 @@ class Config: SELECT ... FROM table WHERE (partition_column mod num_partitions) = num_partitions-1 -- upper_bound + .. versionadded:: 0.5.0 + Examples -------- @@ -383,6 +391,9 @@ class JDBCWriteOptions(JDBCOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.5.0 + Replace ``Connection.Options`` → ``Connection.WriteOptions`` + Examples -------- @@ -466,6 +477,8 @@ class Config: * Table exists An error is raised, and no data is written to the table. + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ batchsize: int = 20_000 @@ -486,6 +499,9 @@ class Config: You can increase it even more, up to ``50_000``, but it depends on your database load and number of columns in the row. Higher values does not increase performance. + + .. versionchanged:: 0.4.0 + Changed default value from 1000 to 20_000 """ isolation_level: str = "READ_UNCOMMITTED" @@ -527,6 +543,8 @@ class JDBCSQLOptions(JDBCOptions): `supported by Spark `_, tailored to optimize SQL query execution. Option names should be in ``camelCase``! + .. versionadded:: 0.11.0 + Split up ``ReadOptions`` to ``SQLOptions`` """ partition_column: Optional[str] = None @@ -599,6 +617,9 @@ class JDBCSQLOptions(JDBCOptions): default ``fetchsize=10``, which is absolutely not usable. Thus we've overridden default value with ``100_000``, which should increase reading performance. + + .. versionchanged:: 0.2.0 + Set explicit default value to ``100_000`` """ class Config: diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 1ab3cb14d..f42450b6e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -174,6 +174,8 @@ def fetch( First call of the method opens the connection to a database. Call ``.close()`` method to close it, or use context manager to do it automatically. + .. versionadded:: 0.2.0 + Parameters ---------- query : str @@ -233,6 +235,8 @@ def execute( First call of the method opens the connection to a database. Call ``.close()`` method to close it, or use context manager to do it automatically. + .. versionadded:: 0.2.0 + Parameters ---------- statement : str diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 2fe94c8bb..2504c3641 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -33,6 +33,9 @@ class JDBCOptions(GenericOptions): You can pass any value supported by underlying JDBC driver class, even if it is not mentioned in this documentation. + + .. deprecated:: 0.11.0 + Use ``FetchOptions`` or ``ExecuteOptions`` instead """ class Config: @@ -67,6 +70,9 @@ class JDBCFetchOptions(GenericOptions): You can pass any value supported by underlying JDBC driver class, even if it is not mentioned in this documentation. + + .. versionadded:: 0.11.0 + Replace ``Connection.JDBCOptions`` → ``Connection.FetchOptions`` """ class Config: @@ -100,6 +106,9 @@ class JDBCExecuteOptions(GenericOptions): You can pass any value supported by underlying JDBC driver class, even if it is not mentioned in this documentation. + + .. 
versionadded:: 0.11.0 + Replace ``Connection.JDBCOptions`` → ``Connection.ExecuteOptions`` """ class Config: diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 08a8fbe65..b64fff143 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -65,6 +65,8 @@ class Kafka(DBConnection): This connector is for **batch** ETL processes, not streaming. + .. versionadded:: 0.9.0 + Parameters ---------- diff --git a/onetl/connection/db_connection/kafka/kafka_auth.py b/onetl/connection/db_connection/kafka/kafka_auth.py index f39d39a3e..f1bbdf101 100644 --- a/onetl/connection/db_connection/kafka/kafka_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_auth.py @@ -12,6 +12,8 @@ class KafkaAuth(ABC): """ Interface for Kafka connection Auth classes. + + .. versionadded:: 0.9.0 """ @abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index e364b4123..4038dd02a 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -22,6 +22,8 @@ class KafkaBasicAuth(KafkaAuth, GenericOptions): For more details see `Kafka Documentation `_. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 27f889ed1..6a20a31af 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -59,6 +59,8 @@ class KafkaKerberosAuth(KafkaAuth, GenericOptions): * `Kafka Documentation `_ * `Krb5LoginModule documentation `_ + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py index fd1384461..2dd3a6a97 100644 --- a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py @@ -21,6 +21,8 @@ class KafkaPlaintextProtocol(KafkaProtocol, FrozenModel): Not recommended to use on production environments. Prefer :obj:`SSLProtocol `. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_protocol.py b/onetl/connection/db_connection/kafka/kafka_protocol.py index 12de89cf8..5d2a328c8 100644 --- a/onetl/connection/db_connection/kafka/kafka_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_protocol.py @@ -12,6 +12,8 @@ class KafkaProtocol(ABC): """ Interface for Kafka connection Protocol classes. + + .. versionadded:: 0.9.0 """ @abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index ddd9280fa..add09f349 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -25,6 +25,8 @@ class KafkaScramAuth(KafkaAuth, GenericOptions): For more details see `Kafka Documentation `_. + .. 
versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index 96e9d89b2..6149f5aa0 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -30,6 +30,8 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): * `IBM Documentation `_ * `How to use PEM Certificates with Kafka `_ + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 0b04f2cb5..e2a4a8d32 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -76,6 +76,8 @@ class KafkaReadOptions(GenericOptions): are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. + .. versionadded:: 0.9.0 + Examples -------- @@ -121,6 +123,8 @@ class KafkaWriteOptions(GenericOptions): are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/slots.py b/onetl/connection/db_connection/kafka/slots.py index cd6bfcbe8..2abf00ce0 100644 --- a/onetl/connection/db_connection/kafka/slots.py +++ b/onetl/connection/db_connection/kafka/slots.py @@ -7,7 +7,11 @@ @support_hooks class KafkaSlots: - """Kafka slots that could be implemented by third-party plugins""" + """ + Kafka slots that could be implemented by third-party plugins + + .. versionadded:: 0.9.0 + """ @slot @staticmethod @@ -17,6 +21,8 @@ def normalize_cluster_name(cluster: str) -> str | None: This can be used to ensure that the Kafka cluster name conforms to specific naming conventions. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str @@ -50,6 +56,8 @@ def get_known_clusters() -> set[str] | None: This can be used to validate if the provided Kafka cluster name is recognized in the system. + .. versionadded:: 0.9.0 + Returns ------- set[str] | None @@ -78,6 +86,8 @@ def normalize_address(address: str, cluster: str) -> str | None: This can be used to format the broker address according to specific rules, such as adding default ports. + .. versionadded:: 0.9.0 + Parameters ---------- address : str @@ -115,6 +125,8 @@ def get_cluster_addresses(cluster: str) -> list[str] | None: This can be used to obtain the broker addresses dynamically. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index e4fa55b1c..568cd9537 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -57,6 +57,8 @@ class MongoDB(DBConnection): Before using this connector please take into account :ref:`mongodb-prerequisites` + .. versionadded:: 0.7.0 + Parameters ---------- host : str @@ -138,6 +140,8 @@ def get_packages( """ Get package names to be downloaded by Spark. Allows specifying custom MongoDB Spark connector versions. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- scala_version : str, optional @@ -151,6 +155,8 @@ def get_packages( package_version : str, optional Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.3.0``. + .. versionadded:: 0.11.0 + Examples -------- .. 
code:: python @@ -237,6 +243,8 @@ def pipeline( This method does not support :ref:`strategy`, use :obj:`DBReader ` instead + .. versionadded:: 0.7.0 + Parameters ---------- @@ -251,7 +259,7 @@ def pipeline( Schema describing the resulting DataFrame. options : PipelineOptions | dict, optional - Additional pipeline options, see :obj:`~PipelineOptions`. + Additional pipeline options, see :obj:`MongoDB.PipelineOptions `. Examples -------- diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 2ddc7a068..223e05ec4 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -94,7 +94,7 @@ def _missing_(cls, value: object): # noqa: WPS120 class MongoDBPipelineOptions(GenericOptions): """Aggregation pipeline options for MongoDB connector. - The only difference from :obj:`MongoDBReadOptions` that it is allowed to pass the ``hint`` parameter. + The only difference from :obj:`MongoDB.ReadOptions ` that it is allowed to pass the ``hint`` parameter. .. note :: @@ -109,6 +109,8 @@ class MongoDBPipelineOptions(GenericOptions): Options ``uri``, ``database``, ``collection``, ``pipeline`` are populated from connection attributes, and cannot be overridden by the user in ``PipelineOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -143,6 +145,8 @@ class MongoDBReadOptions(GenericOptions): Options ``uri``, ``database``, ``collection``, ``pipeline``, ``hint`` are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -151,7 +155,7 @@ class MongoDBReadOptions(GenericOptions): .. code:: python MongoDB.ReadOptions( - batchSize=10000, + sampleSize=100, ) """ @@ -177,6 +181,8 @@ class MongoDBWriteOptions(GenericOptions): Options ``uri``, ``database``, ``collection`` are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -246,6 +252,8 @@ class MongoDBWriteOptions(GenericOptions): * Collection exists An error is raised, and no data is written to the collection. + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ class Config: diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 5fd50aa0f..2ef1f1f69 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -168,6 +168,8 @@ def get_packages( """ Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for MSSQL. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- java_version : str, optional diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index ff21a3be2..4e37d1d52 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -41,6 +41,8 @@ class MySQL(JDBCConnection): Before using this connector please take into account :ref:`mysql-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -122,11 +124,15 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: """ Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version for MySQL. |support_hooks| + .. 
versionadded:: 0.9.0 + Parameters ---------- package_version : str, optional Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.4.0``. + .. versionadded:: 0.11.0 + Examples -------- .. code:: python diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index f07fa8c95..fd0f9c1f7 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -88,6 +88,8 @@ class Oracle(JDBCConnection): Before using this connector please take into account :ref:`oracle-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 3bcac2eb9..44a36eb6e 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -50,6 +50,8 @@ class Postgres(JDBCConnection): Before using this connector please take into account :ref:`postgres-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -131,6 +133,8 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: """ Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- package_version : str, optional diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index fcffe1772..3ca789a13 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -45,6 +45,8 @@ class Teradata(JDBCConnection): Before using this connector please take into account :ref:`teradata-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -146,11 +148,15 @@ def get_packages( """ Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for Teradata. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- package_version : str, optional Specifies the version of the Teradata JDBC driver to use. Defaults to ``17.20.00.15``. + .. versionadded:: 0.11.0 + Examples -------- .. code:: python diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 07b8b53e1..b457b966f 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -62,6 +62,8 @@ class FTP(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 9277b66fd..8cf9aa8fc 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -59,6 +59,8 @@ class FTPS(FTP): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 7105763fe..056622fbe 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -88,6 +88,8 @@ class HDFS(FileConnection, RenameDirMixin): You should pass at least one of these arguments: ``cluster``, ``host``. + .. 
versionadded:: 0.7.0 + host : str, optional Hadoop namenode host. For example: ``namenode1.domain.com``. @@ -223,6 +225,8 @@ def get_current(cls, **kwargs): Can be used only if there are a some hooks bound to slot :obj:`Slots.get_current_cluster ` + .. versionadded:: 0.7.0 + Parameters ---------- user : str diff --git a/onetl/connection/file_connection/hdfs/slots.py b/onetl/connection/file_connection/hdfs/slots.py index 8bd8431a1..2f75fefad 100644 --- a/onetl/connection/file_connection/hdfs/slots.py +++ b/onetl/connection/file_connection/hdfs/slots.py @@ -7,7 +7,10 @@ @support_hooks class HDFSSlots: - """Slots that could be implemented by third-party plugins""" + """Slots that could be implemented by third-party plugins. + + .. versionadded:: 0.7.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -52,6 +57,8 @@ def normalize_namenode_host(host: str, cluster: str | None) -> str | None: If hooks didn't return anything, host is left intact. + .. versionadded:: 0.7.0 + Parameters ---------- host : :obj:`str` @@ -97,6 +104,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into HDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- set[str] | None @@ -128,6 +137,8 @@ def get_cluster_namenodes(cluster: str) -> set[str] | None: Namenode host passed into HDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -166,6 +177,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~get_current_cluster` to automatically fill up ``cluster`` attribute of a connection. If hooks didn't return anything, calling the method above will raise an exception. + .. versionadded:: 0.7.0 + Returns ------- str | None @@ -197,6 +210,8 @@ def get_webhdfs_port(cluster: str) -> int | None: Used by constructor to automatically set port number if omitted. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -242,6 +257,8 @@ def is_namenode_active(host: str, cluster: str | None) -> bool | None: :obj:`~check` will determine whether this host is active. + .. versionadded:: 0.7.0 + Parameters ---------- host : :obj:`str` diff --git a/onetl/connection/file_connection/mixins/rename_dir_mixin.py b/onetl/connection/file_connection/mixins/rename_dir_mixin.py index 2684b0351..c110745c1 100644 --- a/onetl/connection/file_connection/mixins/rename_dir_mixin.py +++ b/onetl/connection/file_connection/mixins/rename_dir_mixin.py @@ -23,6 +23,8 @@ def rename_dir( """ Rename or move dir on remote filesystem. + .. versionadded:: 0.8.0 + Parameters ---------- source_dir_path : str or :obj:`os.PathLike` diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 7811c4c4b..f8f584dcc 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -62,6 +62,8 @@ class S3(FileConnection): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.5.1 + Parameters ---------- host : str @@ -82,6 +84,9 @@ class S3(FileConnection): protocol : str, default : ``https`` Connection protocol. Allowed values: ``https`` or ``http`` + .. 
versionchanged:: 0.6.0 + Renamed ``secure: bool`` to ``protocol: Literal["https", "http"]`` + session_token : str, optional Session token of your account in S3 service diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 3d64669b1..8cd2ac1ed 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -65,6 +65,8 @@ class SFTP(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index f74eed3e3..aa540567e 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -65,6 +65,8 @@ class WebDAV(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.6.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 604e70984..26c1416eb 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -48,6 +48,8 @@ class SparkHDFS(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. For these operations, use :obj:`HDFS ` connection. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str @@ -222,6 +224,8 @@ def get_current(cls, spark: SparkSession): Can be used only if there are a some hooks bound to :obj:`Slots.get_current_cluster `. + .. versionadded:: 0.9.0 + Parameters ---------- spark : SparkSession diff --git a/onetl/connection/file_df_connection/spark_hdfs/slots.py b/onetl/connection/file_df_connection/spark_hdfs/slots.py index d16c6527e..4dab6b548 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/slots.py +++ b/onetl/connection/file_df_connection/spark_hdfs/slots.py @@ -7,7 +7,10 @@ @support_hooks class SparkHDFSSlots: - """Spark HDFS slots that could be implemented by third-party plugins""" + """Spark HDFS slots that could be implemented by third-party plugins. + + .. versionadded:: 0.9.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -52,6 +57,8 @@ def normalize_namenode_host(host: str, cluster: str) -> str | None: If hooks didn't return anything, host is left intact. + .. versionadded:: 0.9.0 + Parameters ---------- host : :obj:`str` @@ -97,6 +104,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into SparkHDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.9.0 + Returns ------- set[str] | None @@ -128,6 +137,8 @@ def get_cluster_namenodes(cluster: str) -> set[str] | None: Namenode host passed into SparkHDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -166,6 +177,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~get_current_cluster` to automatically fill up ``cluster`` attribute of a connection. If hooks didn't return anything, calling the method above will raise an exception. + .. 
versionadded:: 0.9.0 + Returns ------- str | None @@ -197,6 +210,8 @@ def get_ipc_port(cluster: str) -> int | None: Used by constructor to automatically set port number if omitted. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -242,6 +257,8 @@ def is_namenode_active(host: str, cluster: str) -> bool | None: :obj:`~check` will determine whether this host is active. + .. versionadded:: 0.9.0 + Parameters ---------- host : :obj:`str` diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index f65ceaabd..839cbdaec 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -43,6 +43,8 @@ class SparkLocalFS(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. + .. versionadded:: 0.9.0 + Parameters ---------- spark : :class:`pyspark.sql.SparkSession` diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index b71a9ad12..04da89e0e 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -64,6 +64,8 @@ class SparkS3(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. For these operations, use :obj:`S3 ` connection. + .. versionadded:: 0.9.0 + Parameters ---------- host : str @@ -216,6 +218,8 @@ def get_packages( """ Get package names to be downloaded by Spark. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- spark_version : str diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index dff63de7c..91b3f21b1 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -61,6 +61,11 @@ class DBReader(FrozenModel): This class operates with only one source at a time. It does NOT support executing queries to multiple source, like ``SELECT ... JOIN``. + .. versionadded:: 0.1.0 + + .. versionchanged:: 0.8.0 + Moved ``onetl.core.DBReader`` → ``onetl.db.DBReader`` + Parameters ---------- connection : :obj:`onetl.connection.BaseDBConnection` @@ -72,6 +77,9 @@ class DBReader(FrozenModel): If connection has schema support, you need to specify the full name of the source including the schema, e.g. ``schema.name``. + .. versionchanged:: 0.7.0 + Renamed ``table`` → ``source`` + columns : list of str, default: None The list of columns to be read. @@ -148,6 +156,9 @@ class DBReader(FrozenModel): Some sources does not support passing expressions and can be used only with column/field names which present in the source. + .. versionchanged:: 0.10.0 + Replaces deprecated ``hwm_column`` and ``hwm_expression`` attributes + hint : Any, default: ``None`` Hint expression used for querying the data. @@ -506,12 +517,13 @@ def has_data(self) -> bool: .. warning:: - If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + + .. versionadded:: 0.10.0 Raises ------ RuntimeError - Current strategy is not compatible with HWM parameter. 
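A minimal, illustrative sketch of how the ``has_data`` / ``run`` pair documented here is typically combined; the ``postgres`` connection object and the table name are assumed to be defined elsewhere and are placeholders only:

.. code:: python

    from onetl.db import DBReader

    reader = DBReader(connection=postgres, source="schema.table")

    # has_data() lets the pipeline skip downstream steps when the source
    # (or the current strategy window) contains no rows
    if reader.has_data():
        df = reader.run()
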
Examples @@ -563,14 +575,14 @@ def raise_if_no_data(self) -> None: If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + .. versionadded:: 0.10.0 + Raises ------ - RuntimeError - + RuntimeError Current strategy is not compatible with HWM parameter. :obj:`onetl.exception.NoDataError` - There is no data in source. Examples @@ -600,6 +612,8 @@ def run(self) -> DataFrame: If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + .. versionadded:: 0.1.0 + Returns ------- df : pyspark.sql.dataframe.DataFrame diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index 81bc070d6..666fce87e 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -30,6 +30,11 @@ class DBWriter(FrozenModel): """Class specifies schema and table where you can write your dataframe. |support_hooks| + .. versionadded:: 0.1.0 + + .. versionchanged:: 0.8.0 + Moved ``onetl.core.DBReader`` → ``onetl.db.DBReader`` + Parameters ---------- connection : :obj:`onetl.connection.DBConnection` @@ -41,6 +46,9 @@ class DBWriter(FrozenModel): If connection has schema support, you need to specify the full name of the source including the schema, e.g. ``schema.name``. + .. versionchanged:: 0.7.0 + Renamed ``table`` → ``target`` + options : dict, :obj:`onetl.connection.DBConnection.WriteOptions`, default: ``None`` Spark write options. Can be in form of special ``WriteOptions`` object or a dict. @@ -170,6 +178,8 @@ def run(self, df: DataFrame): .. note :: Method does support only **batching** DataFrames. + .. versionadded:: 0.1.0 + Parameters ---------- df : pyspark.sql.dataframe.DataFrame diff --git a/onetl/exception.py b/onetl/exception.py index 45ea80020..03650e9a1 100644 --- a/onetl/exception.py +++ b/onetl/exception.py @@ -23,7 +23,9 @@ class DirectoryNotFoundError(OSError): Like ``FileNotFoundError``, but for directory. Cannot be replaced with ``NotAFileError`` because on some operating systems - (e.g. Linux) there are other file types than regular file and directory - symlink, device, etc + (e.g. Linux) there are other file types than regular file and directory - symlink, device, etc. + + .. versionadded:: 0.3.0 """ @@ -31,87 +33,118 @@ class NotAFileError(OSError): """ Like ``NotADirectoryError``, but for files. - Cannot be replaced with ``FileNotFoundError``, it has different meaning + Cannot be replaced with ``FileNotFoundError``, it has different meaning. + + .. versionadded:: 0.3.0 """ class FileSizeMismatchError(OSError): """ - File size mismatch + File size mismatch. + + .. versionadded:: 0.8.0 """ class DirectoryExistsError(OSError): """ Like ``FileExistsError``, but for directories. + + .. versionadded:: 0.8.0 """ class DirectoryNotEmptyError(OSError): """ - Raised when trying to remove directory contains some files or other directories + Raised when trying to remove directory contains some files or other directories.. + + .. versionadded:: 0.3.0 """ class NoDataError(NeedEvacuation): """ - Raised when there is no data in FileResult or DataFrame + Raised when there is no data in FileResult or DataFrame. + + .. versionadded:: 0.4.0 """ class FilesError(RuntimeError): """ - Raised when something went wrong while working with file collection + Raised when something went wrong while working with file collection. + + .. 
versionadded:: 0.4.0 """ class SkippedFilesError(FilesError): """ - Raised when file collection contains skipped files + Raised when file collection contains skipped files. + + .. versionadded:: 0.4.0 """ class FailedFilesError(FilesError): """ - Raised when file collection contains failed files + Raised when file collection contains failed files. + + .. versionadded:: 0.4.0 """ class MissingFilesError(FilesError): """ - Raised when file collection contains missing files + Raised when file collection contains missing files. + + .. versionadded:: 0.4.0 """ class ZeroFileSizeError(FilesError): """ - Raised when file collection contains some zero-sized file + Raised when file collection contains some zero-sized file. + + .. versionadded:: 0.4.0 """ class EmptyFilesError(FilesError, NoDataError): """ - Raised when file collection is empty + Raised when file collection is empty. + + .. versionadded:: 0.4.0 """ class SparkError(RuntimeError): """ - Raised when something went wrong while working with Spark + Raised when something went wrong while working with Spark. + + .. versionadded:: 0.5.0 """ class TooManyParallelJobsError(SparkError): """ - Raised when number parallel jobs is too high + Raised when number parallel jobs is too high. + + .. versionadded:: 0.5.0 """ class SignatureError(TypeError): """ - Raised when hook signature is not consistent with slot + Raised when hook signature is not consistent with slot. + + .. versionadded:: 0.7.0 """ class TargetAlreadyExistsError(Exception): - """Raised if the target already exists in source""" + """Raised if the target already exists in source. + + .. versionadded:: 0.9.0 + """ diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 6967387a6..b18fc1792 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -43,6 +43,8 @@ class FileDFReader(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.9.0 + Parameters ---------- connection : :obj:`BaseFileDFConnection ` @@ -115,6 +117,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: """ Method for reading files as DataFrame. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index 7fdbe64d9..714cf1a9d 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -32,6 +32,8 @@ class FileDFReaderOptions(FileDFReadOptions, GenericOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- Created reader options diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index f429838df..3fe45ff40 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -80,6 +80,11 @@ class FileDownloader(FrozenModel): It does NOT support direct file transfer between filesystems, like ``FTP -> SFTP``. You should use FileDownloader + :ref:`file-uploader` to implement ``FTP -> local dir -> SFTP``. + .. versionadded:: 0.1.0 + + .. 
versionchanged:: 0.8.0 + Moved ``onetl.core.FileDownloader`` → ``onetl.file.FileDownloader`` + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -112,15 +117,30 @@ class FileDownloader(FrozenModel): Otherwise instead of ``rename``, remote OS will move file between filesystems, which is NOT atomic operation. + .. versionadded:: 0.5.0 + filters : list of :obj:`BaseFileFilter ` Return only files/directories matching these filters. See :ref:`file-filters` + .. versionchanged:: 0.3.0 + Replaces old ``source_path_pattern: str`` and ``exclude_dirs: str`` options. + + .. versionchanged:: 0.8.0 + Renamed ``filter`` → ``filters`` + limits : list of :obj:`BaseFileLimit ` Apply limits to the list of files/directories, and stop if one of the limits is reached. See :ref:`file-limits` + .. versionadded:: 0.4.0 + + .. versionchanged:: 0.8.0 + Renamed ``limit`` → ``limits`` + options : :obj:`~FileDownloader.Options` | dict | None, default: ``None`` - File downloading options. See :obj:`~FileDownloader.Options` + File downloading options. See :obj:`FileDownloader.Options ` + + .. versionadded:: 0.3.0 hwm : type[HWM] | None, default: ``None`` @@ -129,6 +149,11 @@ class FileDownloader(FrozenModel): .. warning :: Used only in :obj:`IncrementalStrategy `. + .. versionadded:: 0.5.0 + + .. versionchanged:: 0.10.0 + Replaces deprecated ``hwm_type`` attribute + Examples -------- Simple Downloader creation @@ -235,6 +260,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul This method can return different results depending on :ref:`strategy` + .. versionadded:: 0.1.0 + Parameters ---------- @@ -247,6 +274,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul If not, download to ``local_path`` **all** input files, **ignoring** filters, limits and HWM. + .. versionadded:: 0.3.0 + Returns ------- :obj:`DownloadResult ` @@ -399,6 +428,8 @@ def view_files(self) -> FileSet[RemoteFile]: This method can return different results depending on :ref:`strategy` + .. versionadded:: 0.3.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` diff --git a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index 07cc2abc3..91dd44d91 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -13,7 +13,10 @@ class FileDownloaderOptions(GenericOptions): - """File downloading options""" + """File downloading options. + + .. versionadded:: 0.3.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,9 @@ class FileDownloaderOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete local directory content before downloading files + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ delete_source: bool = False @@ -31,6 +37,11 @@ class FileDownloaderOptions(GenericOptions): If ``True``, remove source file after successful download. If download failed, file will left intact. + + .. versionadded:: 0.2.0 + + .. versionchanged:: 0.3.0 + Move ``FileUploader.delete_local`` to ``FileUploaderOptions`` """ workers: int = Field(default=1, ge=1) @@ -41,6 +52,8 @@ class FileDownloaderOptions(GenericOptions): 2 or more means files will be downloaded in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. 
versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 0fa3090e1..96d184e9e 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -17,10 +17,12 @@ class DownloadResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. versionadded:: 0.3.0 Examples -------- diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index b55f048d8..0bb2e6669 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -67,6 +67,8 @@ class FileMover(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.8.0 + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -89,7 +91,7 @@ class FileMover(FrozenModel): See :ref:`file-limits` options : :obj:`~FileMover.Options` | dict | None, default: ``None`` - File moving options. See :obj:`~FileMover.Options` + File moving options. See :obj:`FileMover.Options ` Examples -------- @@ -163,6 +165,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: """ Method for moving files from source to target directory. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- @@ -309,6 +313,8 @@ def view_files(self) -> FileSet[RemoteFile]: Get file list in the ``source_path``, after ``filter`` and ``limit`` applied (if any). |support_hooks| + .. versionadded:: 0.8.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index b68326198..ce9c12b24 100644 --- a/onetl/file/file_mover/options.py +++ b/onetl/file/file_mover/options.py @@ -13,7 +13,10 @@ class FileMoverOptions(GenericOptions): - """File moving options""" + """File moving options. + + .. versionadded:: 0.8.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,11 @@ class FileMoverOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete directory content before moving files + + .. versionadded:: 0.8.0 + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ workers: int = Field(default=1, ge=1) @@ -34,6 +42,8 @@ class FileMoverOptions(GenericOptions): 2 or more means files will be moved in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 4c2456999..99313d0f7 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -17,10 +17,12 @@ class MoveResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. 
versionadded:: 0.8.0 Examples -------- diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index 819d357f5..9ab5f088f 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -63,6 +63,11 @@ class FileUploader(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.1.0 + + .. versionchanged:: 0.8.0 + Moved ``onetl.core.FileDownloader`` → ``onetl.file.FileDownloader`` + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -77,6 +82,8 @@ class FileUploader(FrozenModel): Could be ``None``, but only if you pass absolute file paths directly to :obj:`~run` method + .. versionadded:: 0.3.0 + temp_path : os.PathLike or str, optional, default: ``None`` If set, this path will be used for uploading a file, and then renaming it to the target file path. If ``None`` (default since v0.5.0) is passed, files are uploaded directly to ``target_path``. @@ -95,8 +102,11 @@ class FileUploader(FrozenModel): Otherwise instead of ``rename``, remote OS will move file between filesystems, which is NOT atomic operation. + .. versionchanged:: 0.5.0 + Default changed from ``/tmp`` to ``None`` + options : :obj:`~FileUploader.Options` | dict | None, default: ``None`` - File upload options. See :obj:`~FileUploader.Options` + File upload options. See :obj:`FileUploader.Options ` Examples -------- @@ -151,6 +161,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: """ Method for uploading files to remote host. |support_hooks| + .. versionadded:: 0.1.0 + Parameters ---------- @@ -305,6 +317,8 @@ def view_files(self) -> FileSet[LocalPath]: """ Get file list in the ``local_path``. |support_hooks| + .. versionadded:: 0.3.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index 9789d8477..b046ec3a8 100644 --- a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -13,7 +13,10 @@ class FileUploaderOptions(GenericOptions): - """File uploading options""" + """File uploading options. + + .. versionadded:: 0.3.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,9 @@ class FileUploaderOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete local directory content before downloading files + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ delete_local: bool = False @@ -31,6 +37,11 @@ class FileUploaderOptions(GenericOptions): If ``True``, remove local file after successful download. If download failed, file will left intact. + + .. versionadded:: 0.2.0 + + .. versionchanged:: 0.3.0 + Move ``FileUploader.delete_local`` to ``FileUploaderOptions`` """ workers: int = Field(default=1, ge=1) @@ -41,6 +52,8 @@ class FileUploaderOptions(GenericOptions): 2 or more means files will be uploaded in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. 
versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index e3b068190..34638bae5 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -17,10 +17,12 @@ class UploadResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. versionadded:: 0.3.0 Examples -------- diff --git a/onetl/file/filter/exclude_dir.py b/onetl/file/filter/exclude_dir.py index a7d9007a5..f5b096d2c 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -16,6 +16,9 @@ class ExcludeDir(BaseFileFilter, FrozenModel): """Filter files or directories which are included in a specific directory. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index 768261e60..db622cfd3 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -16,6 +16,9 @@ class Glob(BaseFileFilter, FrozenModel): """Filter files or directories with path matching a glob expression. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/filter/match_all_filters.py b/onetl/file/filter/match_all_filters.py index 6695bbf2a..484bee93f 100644 --- a/onetl/file/filter/match_all_filters.py +++ b/onetl/file/filter/match_all_filters.py @@ -14,6 +14,8 @@ def match_all_filters(path: PathProtocol, filters: Iterable[BaseFileFilter]) -> """ Check if input path satisfies all the filters. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index 0c64001da..48e321ad1 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -17,6 +17,9 @@ class Regexp(BaseFileFilter, FrozenModel): r"""Filter files or directories with path matching a regular expression. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 305603791..2657e991f 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -75,6 +75,8 @@ class Avro(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -209,6 +211,8 @@ def parse_column(self, column: str | Column) -> Column: pip install requests + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column @@ -309,6 +313,8 @@ def serialize_column(self, column: str | Column) -> Column: pip install requests + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 9ed1579cb..353a8e987 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -85,6 +85,8 @@ class CSV(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -123,6 +125,8 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: Can be used only with Spark 3.x+ + .. 
versionadded:: 0.11.0 + Parameters ---------- column : str | Column @@ -201,6 +205,8 @@ def serialize_column(self, column: str | Column) -> Column: Can be used only with Spark 3.x+ + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index ee7bfe74c..3f52ae570 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -76,6 +76,8 @@ class Excel(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.4 + Examples -------- diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 630865232..698874424 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -80,6 +80,8 @@ class JSON(ReadOnlyFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -110,6 +112,8 @@ def parse_column(self, column: str | Column, schema: StructType | ArrayType | Ma """ Parses a JSON string column to a structured Spark SQL column using Spark's `from_json `_ function, based on the provided schema. + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column @@ -184,6 +188,8 @@ def serialize_column(self, column: str | Column) -> Column: Serializes a structured Spark SQL column into a JSON string column using Spark's `to_json `_ function. + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column diff --git a/onetl/file/format/jsonline.py b/onetl/file/format/jsonline.py index 9bfd84159..1d1c910d1 100644 --- a/onetl/file/format/jsonline.py +++ b/onetl/file/format/jsonline.py @@ -72,6 +72,8 @@ class JSONLine(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/orc.py b/onetl/file/format/orc.py index a292b6c83..f108a1506 100644 --- a/onetl/file/format/orc.py +++ b/onetl/file/format/orc.py @@ -45,6 +45,8 @@ class ORC(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/parquet.py b/onetl/file/format/parquet.py index 460819463..f96ad4445 100644 --- a/onetl/file/format/parquet.py +++ b/onetl/file/format/parquet.py @@ -49,6 +49,8 @@ class Parquet(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 5165adfca..f0cd647c4 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -107,6 +107,8 @@ class XML(ReadWriteFileFormat): Using ``mode=FAILFAST`` will throw an exception instead of producing ``null`` values. `Follow `_ + .. versionadded:: 0.9.5 + Examples -------- Describe options how to read from/write to XML file with specific options: @@ -276,6 +278,8 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: ], ) + .. versionadded:: 0.11.0 + Parameters ---------- column : str | Column diff --git a/onetl/file/limit/limits_reached.py b/onetl/file/limit/limits_reached.py index 2f3833d63..27d7fb502 100644 --- a/onetl/file/limit/limits_reached.py +++ b/onetl/file/limit/limits_reached.py @@ -14,6 +14,8 @@ def limits_reached(limits: Iterable[BaseFileLimit]) -> bool: """ Check if any of limits reached. + .. 
versionadded:: 0.8.0 + Parameters ---------- limits : Iterable of :obj:`onetl.base.base_file_limit.BaseFileLimit` diff --git a/onetl/file/limit/limits_stop_at.py b/onetl/file/limit/limits_stop_at.py index 2ff1f7945..035ac6424 100644 --- a/onetl/file/limit/limits_stop_at.py +++ b/onetl/file/limit/limits_stop_at.py @@ -14,6 +14,8 @@ def limits_stop_at(path: PathProtocol, limits: Iterable[BaseFileLimit]) -> bool: """ Check if some of limits stops at given path. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` diff --git a/onetl/file/limit/max_files_count.py b/onetl/file/limit/max_files_count.py index 5f2f584d6..ec604ff41 100644 --- a/onetl/file/limit/max_files_count.py +++ b/onetl/file/limit/max_files_count.py @@ -13,6 +13,9 @@ class MaxFilesCount(BaseFileLimit, FrozenModel): """Limits the total number of files handled by :ref:`file-downloader` or :ref:`file-mover`. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileLimit`` + Parameters ---------- diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index f5163be65..452003848 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -24,6 +24,8 @@ class HookPriority(int, Enum): Hook priority enum. All hooks within the same priority are executed in the same order they were registered. + + .. versionadded:: 0.7.0 """ FIRST = -1 @@ -41,6 +43,8 @@ class Hook(Generic[T]): # noqa: WPS338 """ Hook representation. + .. versionadded:: 0.7.0 + Parameters ---------- @@ -81,6 +85,8 @@ def enable(self): """ Enable the hook. + .. versionadded:: 0.7.0 + Examples -------- @@ -107,6 +113,8 @@ def disable(self): """ Disable the hook. + .. versionadded:: 0.7.0 + Examples -------- @@ -141,6 +149,8 @@ def skip(self): You should call :obj:`~enable` explicitly to change its state. + .. versionadded:: 0.7.0 + Examples -------- @@ -365,6 +375,8 @@ def hook(inp: Callable[..., T] | None = None, enabled: bool = True, priority: Ho """ Initialize hook from callable/context manager. + .. versionadded:: 0.7.0 + Examples -------- diff --git a/onetl/hooks/hook_collection.py b/onetl/hooks/hook_collection.py index f5b913966..d715086c6 100644 --- a/onetl/hooks/hook_collection.py +++ b/onetl/hooks/hook_collection.py @@ -16,6 +16,8 @@ class HookCollection: """ Representation of hooks collection. + .. versionadded:: 0.7.0 + Examples -------- @@ -38,6 +40,8 @@ def active(self): If called after :obj:`~stop` or inside :obj:`~skip`, empty collection will be returned. + .. versionadded:: 0.7.0 + Examples -------- @@ -64,6 +68,8 @@ def stop(self) -> None: """ Stop all hooks in the collection. + .. versionadded:: 0.7.0 + Examples -------- @@ -92,6 +98,8 @@ def resume(self) -> None: If hook is disabled by :obj:`onetl.hooks.hook.Hook.disable`, it will stay disabled. You should call :obj:`onetl.hooks.hook.Hook.enable` explicitly. + .. versionadded:: 0.7.0 + Examples -------- @@ -128,6 +136,8 @@ def skip(self): after exiting the context/decorated function. You should call :obj:`~resume` explicitly. + .. versionadded:: 0.7.0 + Examples -------- @@ -165,6 +175,8 @@ def skip(self): def add(self, item: Hook): """Appends hook to the collection. + .. versionadded:: 0.7.0 + Examples -------- @@ -188,6 +200,8 @@ def add(self, item: Hook): def extend(self, hooks: Iterable[Hook]): """Extends collection using a iterator. + .. versionadded:: 0.7.0 + Examples -------- @@ -211,6 +225,8 @@ def extend(self, hooks: Iterable[Hook]): def __iter__(self): """Iterate over hooks in the collection. + .. 
versionadded:: 0.7.0 + Examples -------- @@ -234,6 +250,8 @@ def __iter__(self): def __len__(self): """Return collection length. + .. versionadded:: 0.7.0 + Examples -------- diff --git a/onetl/hooks/hooks_state.py b/onetl/hooks/hooks_state.py index de46b33c0..53a2c0c39 100644 --- a/onetl/hooks/hooks_state.py +++ b/onetl/hooks/hooks_state.py @@ -22,6 +22,8 @@ def stop(cls) -> None: """ Stop all hooks for all classes. + .. versionadded:: 0.7.0 + Examples -------- @@ -51,6 +53,8 @@ def resume(cls) -> None: This function does not enable hooks which were disabled by :obj:`onetl.hooks.hook.Hook.disable`, or stopped by :obj:`onetl.hooks.support_hooks.suspend_hooks`. + .. versionadded:: 0.7.0 + Examples -------- @@ -84,6 +88,8 @@ def skip(cls): after exiting the context/decorated function. You should call :obj:`~resume_all_hooks` explicitly. + .. versionadded:: 0.7.0 + Examples -------- diff --git a/onetl/hooks/slot.py b/onetl/hooks/slot.py index cf9171f63..6d4b0b879 100644 --- a/onetl/hooks/slot.py +++ b/onetl/hooks/slot.py @@ -67,6 +67,8 @@ def bind_hook(method: Callable, inp=None): See :ref:`hooks-design` for more details. + .. versionadded:: 0.7.0 + Examples -------- @@ -255,6 +257,8 @@ def register_slot(cls: type, method_name: str): # noqa: WPS231, WPS213, WPS212 Also ``@classmethod`` is a descriptor, and it can be called only my accessing the class itself, which is not possible within a decorator. + .. versionadded:: 0.7.0 + Examples -------- @@ -465,7 +469,10 @@ def is_slot(method: Callable) -> bool: class Slot(Protocol): - """Protocol which is implemented by a method after applying :obj:`~slot` decorator.""" + """Protocol which is implemented by a method after applying :obj:`~slot` decorator. + + .. versionadded:: 0.7.0 + """ def __call__(self, *args, **kwargs): ... @@ -637,6 +644,8 @@ def slot(method: Method) -> Method: It is not allowed to use this decorator over ``_private`` and ``__protected`` methods and ``@property``. But is allowed to use on ``__dunder__`` methods, like ``__init__``. + .. versionadded:: 0.7.0 + Examples --------- diff --git a/onetl/hooks/support_hooks.py b/onetl/hooks/support_hooks.py index 052ac64fd..33d440b55 100644 --- a/onetl/hooks/support_hooks.py +++ b/onetl/hooks/support_hooks.py @@ -91,6 +91,8 @@ def with_all_hooks_disabled(): # running outside a decorated function restores previous behavior obj = MyClass() obj.my_method(2) # will execute callback(obj, 2) + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -126,6 +128,8 @@ def callback(self, arg): ... MyClass.suspend_hooks() obj.my_method(2) # will NOT execute callback + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -161,6 +165,8 @@ def callback(self, arg): ... MyClass.resume_hooks() obj.my_method(2) # will execute callback(obj, 2) + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -177,6 +183,8 @@ def support_hooks(cls: Klass) -> Klass: Adds :obj:`~skip_hooks`, :obj:`~suspend_hooks` and :obj:`~resume_hooks` to the class. + .. versionadded:: 0.7.0 + Examples --------- diff --git a/onetl/log.py b/onetl/log.py index 32c239b8b..aeed6403f 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -56,6 +56,9 @@ def setup_notebook_logging(level: int | str = logging.INFO) -> None: Should **NOT** be used in applications, you should set up logging settings manually, according to your framework documentation. + .. 
deprecated:: 0.5.0 + Use :obj:`~setup_logging` instead + Parameters ---------- level : ``int`` or ``str``, default ``INFO`` @@ -80,6 +83,9 @@ def setup_logging(level: int | str = logging.INFO, enable_clients: bool = False) Should be used only in IDEs (like Jupyter notebooks or PyCharm), or scripts (ETL pipelines). + .. versionchanged:: 0.5.0 + Renamed ``setup_notebook_logging`` → ``setup_logging`` + Parameters ---------- level : ``int`` or ``str``, default ``INFO`` @@ -92,6 +98,8 @@ def setup_logging(level: int | str = logging.INFO, enable_clients: bool = False) .. note:: For ``level="DEBUG"`` it is recommended to use ``enable_clients=True`` + + .. versionadded:: 0.9.0 """ logging.basicConfig(level=level) @@ -118,6 +126,9 @@ def setup_clients_logging(level: int | str = DISABLED) -> None: Can be used in applications, but it is recommended to set up these loggers according to your framework documentation. + .. versionchanged:: 0.9.0 + Renamed ``disable_clients_logging`` → ``setup_clients_logging`` + Parameters ---------- level : ``int`` or ``str``, default ``DISABLED`` @@ -127,6 +138,8 @@ def setup_clients_logging(level: int | str = DISABLED) -> None: For ``py4j``, logging level with maximum verbosity is ``INFO`` because ``DEBUG`` logs are totally unreadable. + + .. versionadded:: 0.9.0 """ for client_module in CLIENT_MODULES: diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index d6999e96e..0397514b6 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -139,6 +139,8 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): * FileDownloader creates files on local filesystem, and file content may differ for different :obj:`modes `. * It can remove files from the source if :obj:`delete_source ` is set to ``True``. + .. versionadded:: 0.1.0 + Parameters ---------- offset : Any, default: ``None`` @@ -400,6 +402,8 @@ class IncrementalBatchStrategy(OffsetMixin, BatchHWMStrategy): supports batch strategy. For example, Kafka connection doesn't support it. Make sure the connection you use is compatible with the IncrementalBatchStrategy. + .. versionadded:: 0.1.0 + Parameters ---------- step : Any diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index cf5a0d154..77ed4b35b 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -46,6 +46,8 @@ class SnapshotStrategy(BaseStrategy): }, ) + .. versionadded:: 0.1.0 + Examples -------- @@ -158,6 +160,8 @@ class SnapshotBatchStrategy(BatchHWMStrategy): supports batch strategy. For example, Kafka connection doesn't support it. Make sure the connection you use is compatible with the SnapshotBatchStrategy. + .. versionadded:: 0.1.0 + Parameters ---------- step : Any From 49a56869c254872e36b5bb4af3ccd71888bec10c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 27 May 2024 09:21:59 +0000 Subject: [PATCH 69/71] [DOP-16174] Fix changelog --- docs/changelog/0.11.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changelog/0.11.0.rst b/docs/changelog/0.11.0.rst index e74ba41fa..8abdc2b7a 100644 --- a/docs/changelog/0.11.0.rst +++ b/docs/changelog/0.11.0.rst @@ -220,6 +220,8 @@ Few documentation improvements. - Add note about connecting to Clickhouse cluster. (:github:pull:`280`). 
+- Add notes about versions when specific class/method/attribute/argument was added, renamed or changed behavior (:github:`282`). + Bug Fixes --------- From 1c4e83ddf7e7f4602b7b45f94c6d74d348c5792a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 27 May 2024 09:27:26 +0000 Subject: [PATCH 70/71] [DOP-16174] Fix changelog --- onetl/file/format/excel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index 3f52ae570..e41fa801f 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -53,8 +53,6 @@ class Excel(ReadWriteFileFormat): Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions. - .. versionadded:: 0.9.4 - .. dropdown:: Version compatibility * Spark versions: 3.2.x - 3.5.x From 48af27e28fbd67ffc18a1763dc4f505831e23ab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 27 May 2024 09:35:47 +0000 Subject: [PATCH 71/71] [DOP-16174] Add versionadded to Avro, XML and Excel get_packages method --- onetl/file/format/avro.py | 2 ++ onetl/file/format/excel.py | 2 ++ onetl/file/format/xml.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 2657e991f..3699620b0 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -131,6 +131,8 @@ def get_packages( See `Maven package index `_ for all available packages. + .. versionadded:: 0.9.0 + Parameters ---------- spark_version : str diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index e41fa801f..2ec12758a 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -126,6 +126,8 @@ def get_packages( See `Maven index `_ and `official documentation `_. + .. versionadded:: 0.9.4 + Parameters ---------- spark_version : str diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index f0cd647c4..cc7cd4777 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -150,6 +150,8 @@ def get_packages( # noqa: WPS231 """ Get package names to be downloaded by Spark. |support_hooks| + .. versionadded:: 0.9.5 + Parameters ---------- spark_version : str
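A short, illustrative sketch of how these ``get_packages`` helpers are usually combined when building a Spark session; the Spark version below is a placeholder, pick the one matching your cluster:

.. code:: python

    from pyspark.sql import SparkSession

    from onetl.file.format import Avro, Excel, XML

    # collect Maven coordinates for every file format used in the job
    maven_packages = [
        *Avro.get_packages(spark_version="3.5.1"),
        *Excel.get_packages(spark_version="3.5.1"),
        *XML.get_packages(spark_version="3.5.1"),
    ]

    spark = (
        SparkSession.builder
        .config("spark.jars.packages", ",".join(maven_packages))
        .getOrCreate()
    )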