From 47489df8a2ffdc14b5770fec672cc39e7f1674b5 Mon Sep 17 00:00:00 2001 From: Jon Fearer Date: Mon, 20 Dec 2021 13:11:17 -0700 Subject: [PATCH] fix(bigquery): fix big query to mssql/mysql transfer issues --- .../providers/google/cloud/hooks/bigquery.py | 9 ++- .../cloud/transfers/bigquery_to_mssql.py | 67 +++++++------------ .../cloud/transfers/bigquery_to_mysql.py | 60 ++++++----------- .../google/cloud/utils/bigquery_get_data.py | 56 ++++++++++++++++ docs/spelling_wordlist.txt | 1 + .../cloud/transfers/test_bigquery_to_mssql.py | 15 ++--- 6 files changed, 119 insertions(+), 89 deletions(-) create mode 100644 airflow/providers/google/cloud/utils/bigquery_get_data.py diff --git a/airflow/providers/google/cloud/hooks/bigquery.py b/airflow/providers/google/cloud/hooks/bigquery.py index b32971d3aca85..25da5470e82ec 100644 --- a/airflow/providers/google/cloud/hooks/bigquery.py +++ b/airflow/providers/google/cloud/hooks/bigquery.py @@ -1299,7 +1299,14 @@ def get_tabledata( :return: list of rows """ warnings.warn("This method is deprecated. Please use `list_rows`.", DeprecationWarning) - rows = self.list_rows(dataset_id, table_id, max_results, selected_fields, page_token, start_index) + rows = self.list_rows( + dataset_id=dataset_id, + table_id=table_id, + max_results=max_results, + selected_fields=selected_fields, + page_token=page_token, + start_index=start_index, + ) return [dict(r) for r in rows] @GoogleBaseHook.fallback_to_default_project_id diff --git a/airflow/providers/google/cloud/transfers/bigquery_to_mssql.py b/airflow/providers/google/cloud/transfers/bigquery_to_mssql.py index bd06b85e232bf..42dc11e6baf8a 100644 --- a/airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +++ b/airflow/providers/google/cloud/transfers/bigquery_to_mssql.py @@ -16,12 +16,11 @@ # specific language governing permissions and limitations # under the License. """This module contains Google BigQuery to MSSQL operator.""" -from typing import Optional, Sequence, Union - -from google.cloud.bigquery.table import TableReference +from typing import List, Optional, Sequence, Union from airflow.models import BaseOperator from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook +from airflow.providers.google.cloud.utils.bigquery_get_data import bigquery_get_data from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook @@ -79,7 +78,7 @@ class BigQueryToMsSqlOperator(BaseOperator): If set as a sequence, the identities from the list must grant Service Account Token Creator IAM role to the directly preceding identity, with first account from the list granting this role to the originating account (templated). - :type impersonation_chain: Union[str, Sequence[str]] + :type impersonation_chain: str | Sequence[str] """ template_fields = ('source_project_dataset_table', 'mssql_table', 'impersonation_chain') @@ -89,7 +88,7 @@ def __init__( *, source_project_dataset_table: str, mssql_table: str, - selected_fields: Optional[str] = None, + selected_fields: Optional[Union[List[str], str]] = None, gcp_conn_id: str = 'google_cloud_default', mssql_conn_id: str = 'mssql_default', database: Optional[str] = None, @@ -111,47 +110,33 @@ def __init__( self.batch_size = batch_size self.location = location self.impersonation_chain = impersonation_chain + try: + _, self.dataset_id, self.table_id = source_project_dataset_table.split('.') + except ValueError: + raise ValueError( + f'Could not parse {source_project_dataset_table} as ..' + ) from None self.source_project_dataset_table = source_project_dataset_table - def _bq_get_data(self): - - hook = BigQueryHook( + def execute(self, context) -> None: + big_query_hook = BigQueryHook( bigquery_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, location=self.location, impersonation_chain=self.impersonation_chain, ) - table_ref = TableReference.from_string(self.source_project_dataset_table) - self.log.info('Fetching Data from:') - self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id, table_ref.table_id) - - conn = hook.get_conn() - cursor = conn.cursor() - i = 0 - while True: - response = cursor.get_tabledata( - dataset_id=table_ref.dataset_id, - table_id=table_ref.table_id, - max_results=self.batch_size, - selected_fields=self.selected_fields, - start_index=i * self.batch_size, - ) - - if 'rows' not in response: - self.log.info('Job Finished') - return - - rows = response['rows'] - - self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size) - - table_data = [] - table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows] - - yield table_data - i += 1 - - def execute(self, context): mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id, schema=self.database) - for rows in self._bq_get_data(): - mssql_hook.insert_rows(self.mssql_table, rows, replace=self.replace) + for rows in bigquery_get_data( + self.log, + self.dataset_id, + self.table_id, + big_query_hook, + self.batch_size, + self.selected_fields, + ): + mssql_hook.insert_rows( + table=self.mssql_table, + rows=rows, + target_fields=self.selected_fields, + replace=self.replace, + ) diff --git a/airflow/providers/google/cloud/transfers/bigquery_to_mysql.py b/airflow/providers/google/cloud/transfers/bigquery_to_mysql.py index be0de9ab7d51d..686e838d1423d 100644 --- a/airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +++ b/airflow/providers/google/cloud/transfers/bigquery_to_mysql.py @@ -16,10 +16,11 @@ # specific language governing permissions and limitations # under the License. """This module contains Google BigQuery to MySQL operator.""" -from typing import Optional, Sequence, Union +from typing import List, Optional, Sequence, Union from airflow.models import BaseOperator from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook +from airflow.providers.google.cloud.utils.bigquery_get_data import bigquery_get_data from airflow.providers.mysql.hooks.mysql import MySqlHook @@ -76,7 +77,7 @@ class BigQueryToMySqlOperator(BaseOperator): If set as a sequence, the identities from the list must grant Service Account Token Creator IAM role to the directly preceding identity, with first account from the list granting this role to the originating account (templated). - :type impersonation_chain: Union[str, Sequence[str]] + :type impersonation_chain: str | Sequence[str] """ template_fields = ( @@ -91,7 +92,7 @@ def __init__( *, dataset_table: str, mysql_table: str, - selected_fields: Optional[str] = None, + selected_fields: Optional[Union[List[str], str]] = None, gcp_conn_id: str = 'google_cloud_default', mysql_conn_id: str = 'mysql_default', database: Optional[str] = None, @@ -116,46 +117,27 @@ def __init__( try: self.dataset_id, self.table_id = dataset_table.split('.') except ValueError: - raise ValueError(f'Could not parse {dataset_table} as .
') + raise ValueError(f'Could not parse {dataset_table} as .
') from None - def _bq_get_data(self): - self.log.info('Fetching Data from:') - self.log.info('Dataset: %s ; Table: %s', self.dataset_id, self.table_id) - - hook = BigQueryHook( + def execute(self, context) -> None: + big_query_hook = BigQueryHook( bigquery_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, location=self.location, impersonation_chain=self.impersonation_chain, ) - - i = 0 - while True: - response = hook.list_rows( - dataset_id=self.dataset_id, - table_id=self.table_id, - max_results=self.batch_size, - selected_fields=self.selected_fields, - start_index=i * self.batch_size, - ) - rows = [dict(r) for r in response] - if len(rows) == 0: - self.log.info('Job Finished') - return - - self.log.info('Total Extracted rows: %s', len(rows) + i * self.batch_size) - - table_data = [] - for dict_row in rows: - single_row = [] - for fields in dict_row['f']: - single_row.append(fields['v']) - table_data.append(single_row) - - yield table_data - i += 1 - - def execute(self, context): mysql_hook = MySqlHook(schema=self.database, mysql_conn_id=self.mysql_conn_id) - for rows in self._bq_get_data(): - mysql_hook.insert_rows(self.mysql_table, rows, replace=self.replace) + for rows in bigquery_get_data( + self.log, + self.dataset_id, + self.table_id, + big_query_hook, + self.batch_size, + self.selected_fields, + ): + mysql_hook.insert_rows( + table=self.mysql_table, + rows=rows, + target_fields=self.selected_fields, + replace=self.replace, + ) diff --git a/airflow/providers/google/cloud/utils/bigquery_get_data.py b/airflow/providers/google/cloud/utils/bigquery_get_data.py new file mode 100644 index 0000000000000..b864c0fc62564 --- /dev/null +++ b/airflow/providers/google/cloud/utils/bigquery_get_data.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections.abc import Iterator +from logging import Logger +from typing import List, Union + +from google.cloud.bigquery.table import Row + +from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook + + +def bigquery_get_data( + logger: Logger, + dataset_id: str, + table_id: str, + big_query_hook: BigQueryHook, + batch_size: int, + selected_fields: Union[List, str], +) -> Iterator: + logger.info('Fetching Data from:') + logger.info('Dataset: %s ; Table: %s', dataset_id, table_id) + + i = 0 + while True: + rows: List[Row] = big_query_hook.list_rows( + dataset_id=dataset_id, + table_id=table_id, + max_results=batch_size, + selected_fields=selected_fields, + start_index=i * batch_size, + ) + + if len(rows) == 0: + logger.info('Job Finished') + return + + logger.info('Total Extracted rows: %s', len(rows) + i * batch_size) + + yield [row.values() for row in rows] + + i += 1 diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 4703bd2408073..430132b645db6 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -35,6 +35,7 @@ BaseOperator BaseView Beauchemin Behaviour +BigQueryHook Bigquery Bigtable Bitshift diff --git a/tests/providers/google/cloud/transfers/test_bigquery_to_mssql.py b/tests/providers/google/cloud/transfers/test_bigquery_to_mssql.py index 01a54bbb67885..5f88c0cedff66 100644 --- a/tests/providers/google/cloud/transfers/test_bigquery_to_mssql.py +++ b/tests/providers/google/cloud/transfers/test_bigquery_to_mssql.py @@ -40,12 +40,11 @@ def test_execute_good_request_to_bq(self, mock_hook): operator.execute(None) # fmt: off - mock_hook.return_value.get_conn.return_value.cursor.return_value.get_tabledata\ - .assert_called_once_with( - dataset_id=TEST_DATASET, - table_id=TEST_TABLE_ID, - max_results=1000, - selected_fields=None, - start_index=0, - ) + mock_hook.return_value.list_rows.assert_called_once_with( + dataset_id=TEST_DATASET, + table_id=TEST_TABLE_ID, + max_results=1000, + selected_fields=None, + start_index=0, + ) # fmt: on