diff --git a/CHANGELOG.md b/CHANGELOG.md index f5b13285..00370b38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## [0.24.0](https://github.com/googleapis/python-bigquery-pandas/compare/v0.23.2...v0.24.0) (2024-10-14) + + +### ⚠ BREAKING CHANGES + +* `to_gbq` loads naive (no timezone) columns to BigQuery DATETIME instead of TIMESTAMP ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) +* `to_gbq` loads object column containing bool values to BOOLEAN instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) +* `to_gbq` loads object column containing dictionary values to STRUCT instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) +* `to_gbq` loads `uint8` columns to BigQuery INT64 instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) + +### Features + +* Adds the capability to include a custom user agent string ([#819](https://github.com/googleapis/python-bigquery-pandas/issues/819)) ([d43457b](https://github.com/googleapis/python-bigquery-pandas/commit/d43457b3838bdc135337cae47c56af397bb1d6d1)) + + +### Bug Fixes + +* `to_gbq` loads `uint8` columns to BigQuery INT64 instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) ([107bb40](https://github.com/googleapis/python-bigquery-pandas/commit/107bb40218b531be1a4f646b8fb0cea5bdfd8aee)) +* `to_gbq` loads naive (no timezone) columns to BigQuery DATETIME instead of TIMESTAMP ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) ([107bb40](https://github.com/googleapis/python-bigquery-pandas/commit/107bb40218b531be1a4f646b8fb0cea5bdfd8aee)) +* `to_gbq` loads object column containing bool values to BOOLEAN instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) ([107bb40](https://github.com/googleapis/python-bigquery-pandas/commit/107bb40218b531be1a4f646b8fb0cea5bdfd8aee)) +* `to_gbq` loads object column containing dictionary values to STRUCT instead of STRING ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) ([107bb40](https://github.com/googleapis/python-bigquery-pandas/commit/107bb40218b531be1a4f646b8fb0cea5bdfd8aee)) + + +### Dependencies + +* Min pyarrow is now 4.0.0 to support compliant nested types ([#814](https://github.com/googleapis/python-bigquery-pandas/issues/814)) ([107bb40](https://github.com/googleapis/python-bigquery-pandas/commit/107bb40218b531be1a4f646b8fb0cea5bdfd8aee)) + ## [0.23.2](https://github.com/googleapis/python-bigquery-pandas/compare/v0.23.1...v0.23.2) (2024-09-20) diff --git a/noxfile.py b/noxfile.py index d316dac8..461b761c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -51,6 +51,7 @@ UNIT_TEST_EXTRAS = [ "bqstorage", "tqdm", + "geopandas", ] UNIT_TEST_EXTRAS_BY_PYTHON = { "3.9": [], } @@ -207,6 +208,7 @@ def default(session): session.run( "py.test", "--quiet", + "-W default::PendingDeprecationWarning", f"--junitxml=unit_{session.python}_sponge_log.xml", "--cov=pandas_gbq", "--cov=tests/unit", @@ -289,6 +291,7 @@ def system(session): session.run( "py.test", "--quiet", + "-W default::PendingDeprecationWarning", f"--junitxml=system_{session.python}_sponge_log.xml", system_test_path, *session.posargs, @@ -297,6 +300,7 @@ def system(session): session.run( "py.test", "--quiet", + "-W default::PendingDeprecationWarning", f"--junitxml=system_{session.python}_sponge_log.xml", system_test_folder_path, *session.posargs, @@ -371,6 +375,7 @@ def
prerelease(session): session.run( "py.test", "--quiet", + "-W default::PendingDeprecationWarning", f"--junitxml=prerelease_unit_{session.python}_sponge_log.xml", os.path.join("tests", "unit"), *session.posargs, @@ -379,6 +384,7 @@ def prerelease(session): session.run( "py.test", "--quiet", + "-W default::PendingDeprecationWarning", f"--junitxml=prerelease_system_{session.python}_sponge_log.xml", os.path.join("tests", "system"), *session.posargs, diff --git a/owlbot.py b/owlbot.py index 916a7074..190298a6 100644 --- a/owlbot.py +++ b/owlbot.py @@ -32,7 +32,7 @@ # Use a middle version of Python to test when no extras are installed. "3.9": [] } -extras = ["tqdm"] +extras = ["tqdm", "geopandas"] templated_files = common.py_library( unit_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"], system_test_python_versions=["3.8", "3.9", "3.10", "3.11", "3.12"], diff --git a/pandas_gbq/__init__.py b/pandas_gbq/__init__.py index 6d92bfa2..76c33d60 100644 --- a/pandas_gbq/__init__.py +++ b/pandas_gbq/__init__.py @@ -2,10 +2,24 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. +import warnings + from pandas_gbq import version as pandas_gbq_version +from . import _versions_helpers from .gbq import Context, context, read_gbq, to_gbq # noqa +sys_major, sys_minor, sys_micro = _versions_helpers.extract_runtime_version() +if sys_major == 3 and sys_minor in (7, 8): + warnings.warn( + "The python-bigquery library will stop supporting Python 3.7 " + "and Python 3.8 in a future major release expected in Q4 2024. " + f"Your Python version is {sys_major}.{sys_minor}.{sys_micro}. We " + "recommend that you update soon to ensure ongoing support. For " + "more details, see: [Google Cloud Client Libraries Supported Python Versions policy](https://cloud.google.com/python/docs/supported-python-versions)", + PendingDeprecationWarning, + ) + __version__ = pandas_gbq_version.__version__ __all__ = [ diff --git a/pandas_gbq/_versions_helpers.py b/pandas_gbq/_versions_helpers.py new file mode 100644 index 00000000..37247c45 --- /dev/null +++ b/pandas_gbq/_versions_helpers.py @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for verifying versions of installed modules.""" + + +import sys +from typing import Tuple + + +def extract_runtime_version() -> Tuple[int, int, int]: + # Retrieve the version information + version_info = sys.version_info + + # Extract the major, minor, and micro components + major = version_info.major + minor = version_info.minor + micro = version_info.micro + + # Return the version components as a tuple + return major, minor, micro diff --git a/pandas_gbq/core/__init__.py b/pandas_gbq/core/__init__.py new file mode 100644 index 00000000..02d26e8e --- /dev/null +++ b/pandas_gbq/core/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file.
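# A minimal sketch (not part of the diff): Python ignores
# PendingDeprecationWarning by default, which is why the noxfile changes
# above run pytest with `-W default::PendingDeprecationWarning`. In a fresh
# interpreter on Python 3.7 or 3.8, the new import-time warning from
# pandas_gbq/__init__.py can be surfaced like this:
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("default", PendingDeprecationWarning)
    import pandas_gbq  # noqa: F401  # runs the runtime-version check at import time

for caught_warning in caught:
    print(f"{caught_warning.category.__name__}: {caught_warning.message}")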
diff --git a/pandas_gbq/core/pandas.py b/pandas_gbq/core/pandas.py new file mode 100644 index 00000000..37557adf --- /dev/null +++ b/pandas_gbq/core/pandas.py @@ -0,0 +1,70 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import itertools + +import pandas + + +def list_columns_and_indexes(dataframe, index=True): + """Return all index and column names with dtypes. + + Returns: + Sequence[Tuple[str, dtype]]: + Returns a list of indexes and column names with + corresponding dtypes. If an index is missing a name or has the + same name as a column, the index is omitted. + """ + column_names = frozenset(dataframe.columns) + columns_and_indexes = [] + if index: + if isinstance(dataframe.index, pandas.MultiIndex): + for name in dataframe.index.names: + if name and name not in column_names: + values = dataframe.index.get_level_values(name) + columns_and_indexes.append((name, values.dtype)) + else: + if dataframe.index.name and dataframe.index.name not in column_names: + columns_and_indexes.append( + (dataframe.index.name, dataframe.index.dtype) + ) + + columns_and_indexes += zip(dataframe.columns, dataframe.dtypes) + return columns_and_indexes + + +def first_valid(series): + first_valid_index = series.first_valid_index() + if first_valid_index is not None: + return series.at[first_valid_index] + + +def first_array_valid(series): + """Return the first "meaningful" element from the array series. + + Here, "meaningful" means the first non-None element in one of the arrays that can + be used for type detection. + """ + first_valid_index = series.first_valid_index() + if first_valid_index is None: + return None + + valid_array = series.at[first_valid_index] + valid_item = next((item for item in valid_array if not pandas.isna(item)), None) + + if valid_item is not None: + return valid_item + + # Valid item is None because all items in the "valid" array are invalid. Try + # to find a true valid array manually. + for array in itertools.islice(series, first_valid_index + 1, None): + try: + array_iter = iter(array) + except TypeError: + continue # Not an array, apparently, e.g. None, thus skip.
+ valid_item = next((item for item in array_iter if not pandas.isna(item)), None) + if valid_item is not None: + break + + return valid_item diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 19c42a6b..b04ad131 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -25,6 +25,7 @@ from pandas_gbq.features import FEATURES import pandas_gbq.query import pandas_gbq.schema +import pandas_gbq.schema.pandas_to_bigquery import pandas_gbq.timestamp try: @@ -266,6 +267,8 @@ def __init__( auth_redirect_uri=None, client_id=None, client_secret=None, + user_agent=None, + rfc9110_delimiter=False, ): global context from google.api_core.exceptions import ClientError, GoogleAPIError @@ -283,6 +286,8 @@ def __init__( self.auth_redirect_uri = auth_redirect_uri self.client_id = client_id self.client_secret = client_secret + self.user_agent = user_agent + self.rfc9110_delimiter = rfc9110_delimiter default_project = None @@ -336,11 +341,15 @@ def log_elapsed_seconds(self, prefix="Elapsed", postfix="s.", overlong=6): def get_client(self): import google.api_core.client_info - import pandas bigquery = FEATURES.bigquery_try_import() + + user_agent = create_user_agent( + user_agent=self.user_agent, rfc9110_delimiter=self.rfc9110_delimiter + ) + client_info = google.api_core.client_info.ClientInfo( - user_agent="pandas-{}".format(pandas.__version__) + user_agent=user_agent, ) return bigquery.Client( project=self.project_id, @@ -960,6 +969,8 @@ def to_gbq( auth_redirect_uri=None, client_id=None, client_secret=None, + user_agent=None, + rfc9110_delimiter=False, ): """Write a DataFrame to a Google BigQuery table. @@ -1071,6 +1082,13 @@ def to_gbq( client_secret : str The Client Secret associated with the Client ID for the Google Cloud Project the user is attempting to connect to. + user_agent : str + Custom user agent string used as a prefix to the pandas version. + rfc9110_delimiter : bool + Sets user agent delimiter to a hyphen or a slash. + Default is False, meaning a hyphen will be used. + + .. versionadded:: 0.24.0 """ _test_google_api_imports() @@ -1129,6 +1147,8 @@ def to_gbq( auth_redirect_uri=auth_redirect_uri, client_id=client_id, client_secret=client_secret, + user_agent=user_agent, + rfc9110_delimiter=rfc9110_delimiter, ) bqclient = connector.client @@ -1219,9 +1239,16 @@ def _generate_bq_schema(df, default_type="STRING"): be overridden: https://github.com/pydata/pandas-gbq/issues/218, this method can be removed after there is time to migrate away from this method.""" - from pandas_gbq import schema + fields = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + df, + default_type=default_type, + ) + fields_json = [] + + for field in fields: + fields_json.append(field.to_api_repr()) - return schema.generate_bq_schema(df, default_type=default_type) + return {"fields": fields_json} class _Table(GbqConnector): @@ -1401,3 +1428,59 @@ def create(self, dataset_id): self.client.create_dataset(dataset) except self.http_error as ex: self.process_http_error(ex) + + +def create_user_agent( + user_agent: Optional[str] = None, rfc9110_delimiter: bool = False +) -> str: + """Creates a user agent string. + + The legacy format of our user agent string was: `product-x.y.z` (where x, + y, and z are the major, minor, and micro version numbers). + + Users are able to prepend this string with their own user agent identifier + to render something similar to `<user_agent> pandas-x.y.z`.
+ + The legacy format used a hyphen to separate the product from the product + version, which differs slightly from the format recommended by RFC9110, which is: + `product/x.y.z`. To produce a user agent more in line with the RFC, set + rfc9110_delimiter to True. This setting does not depend on whether a + user_agent is also supplied. + + Reference: + https://www.rfc-editor.org/info/rfc9110 + + Args: + user_agent (Optional[str]): User agent string. + + rfc9110_delimiter (Optional[bool]): Sets delimiter to a hyphen or a slash. + Default is False, meaning a hyphen will be used. + + Returns (str): + Customized user agent string. + + Deprecation Warning: + In a future major release, the default delimiter will be changed to + a `/` in accordance with RFC9110. + """ + import pandas as pd + + if rfc9110_delimiter: + delimiter = "/" + else: + warnings.warn( + "In a future major release, the default delimiter will be " + "changed to a `/` in accordance with RFC9110.", + PendingDeprecationWarning, + stacklevel=2, + ) + delimiter = "-" + + identity = f"pandas{delimiter}{pd.__version__}" + + if user_agent is None: + user_agent = identity + else: + user_agent = f"{user_agent} {identity}" + + return user_agent diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 45e474b2..567899df 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -15,6 +15,8 @@ from pandas_gbq import exceptions import pandas_gbq.schema +import pandas_gbq.schema.bigquery +import pandas_gbq.schema.pandas_to_bigquery def encode_chunk(dataframe): @@ -214,11 +216,9 @@ def load_csv_from_file( This method is needed for writing with google-cloud-bigquery versions that don't implement load_table_from_dataframe with the CSV serialization format. """ - if schema is None: - schema = pandas_gbq.schema.generate_bq_schema(dataframe) - - schema = pandas_gbq.schema.remove_policy_tags(schema) - bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema) + bq_schema = pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields( + dataframe, schema + ) def load_chunk(chunk, job_config): try: diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema/__init__.py similarity index 85% rename from pandas_gbq/schema.py rename to pandas_gbq/schema/__init__.py index b60fdeda..350a1d2e 100644 --- a/pandas_gbq/schema.py +++ b/pandas_gbq/schema/__init__.py @@ -92,37 +92,6 @@ def schema_is_subset(schema_remote, schema_local): return all(field in fields_remote for field in fields_local) -def generate_bq_schema(dataframe, default_type="STRING"): - """Given a passed dataframe, generate the associated Google BigQuery schema. - - Arguments: - dataframe (pandas.DataFrame): D - default_type : string - The default big query type in case the type of the column - does not exist in the schema. - """ - - # If you update this mapping, also update the table at - # `docs/source/writing.rst`. - type_mapping = { - "i": "INTEGER", - "b": "BOOLEAN", - "f": "FLOAT", - "O": "STRING", - "S": "STRING", - "U": "STRING", - "M": "TIMESTAMP", - } - - fields = [] - for column_name, dtype in dataframe.dtypes.items(): - fields.append( - {"name": column_name, "type": type_mapping.get(dtype.kind, default_type)} - ) - - return {"fields": fields} - - def update_schema(schema_old, schema_new): """ Given an old BigQuery schema, update it with a new one.
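# A minimal sketch (not part of the diff) of what the new create_user_agent
# helper produces, mirroring the expectations pinned down by
# test_create_user_agent at the end of this diff. "my-app/1.0" is a
# hypothetical caller-supplied identifier; to_gbq accepts the same
# user_agent and rfc9110_delimiter arguments and passes them through.
import pandas as pd

from pandas_gbq.gbq import create_user_agent

# Legacy hyphen delimiter (also emits a PendingDeprecationWarning).
assert create_user_agent() == f"pandas-{pd.__version__}"

# RFC 9110 slash delimiter, with a custom prefix prepended.
assert (
    create_user_agent(user_agent="my-app/1.0", rfc9110_delimiter=True)
    == f"my-app/1.0 pandas/{pd.__version__}"
)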
diff --git a/pandas_gbq/schema/bigquery.py b/pandas_gbq/schema/bigquery.py new file mode 100644 index 00000000..0de21978 --- /dev/null +++ b/pandas_gbq/schema/bigquery.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections + +import google.cloud.bigquery + + +def to_schema_fields(schema): + """Coerce `schema` to a list of schema field instances. + + Args: + schema(Sequence[Union[ \ + :class:`~google.cloud.bigquery.schema.SchemaField`, \ + Mapping[str, Any] \ + ]]): + Table schema to convert. If some items are passed as mappings, + their content must be compatible with + :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`. + + Returns: + Sequence[:class:`~google.cloud.bigquery.schema.SchemaField`] + + Raises: + ValueError: If any item in the sequence is not a + :class:`~google.cloud.bigquery.schema.SchemaField` + instance or a compatible mapping representation of the field. + """ + for field in schema: + if not isinstance( + field, (google.cloud.bigquery.SchemaField, collections.abc.Mapping) + ): + raise ValueError( + "Schema items must either be fields or compatible " + "mapping representations." + ) + + return [ + field + if isinstance(field, google.cloud.bigquery.SchemaField) + else google.cloud.bigquery.SchemaField.from_api_repr(field) + for field in schema + ] diff --git a/pandas_gbq/schema/pandas_to_bigquery.py b/pandas_gbq/schema/pandas_to_bigquery.py new file mode 100644 index 00000000..5a979a12 --- /dev/null +++ b/pandas_gbq/schema/pandas_to_bigquery.py @@ -0,0 +1,218 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections.abc +import datetime +from typing import Optional, Tuple +import warnings + +import db_dtypes +from google.cloud.bigquery import schema +import pandas +import pyarrow + +import pandas_gbq.core.pandas +import pandas_gbq.schema.bigquery +import pandas_gbq.schema.pyarrow_to_bigquery + +try: + # _BaseGeometry is used to detect shapely objects during schema detection + from shapely.geometry.base import BaseGeometry as _BaseGeometry # type: ignore +except ImportError: + # No shapely, use NoneType for _BaseGeometry as a placeholder. + _BaseGeometry = type(None) + + +# If you update this mapping, also update the table at +# `docs/source/writing.rst`. +_PANDAS_DTYPE_TO_BQ = { + "bool": "BOOLEAN", + "datetime64[ns, UTC]": "TIMESTAMP", + "datetime64[ns]": "DATETIME", + "float32": "FLOAT", + "float64": "FLOAT", + "int8": "INTEGER", + "int16": "INTEGER", + "int32": "INTEGER", + "int64": "INTEGER", + "uint8": "INTEGER", + "uint16": "INTEGER", + "uint32": "INTEGER", + "geometry": "GEOGRAPHY", + db_dtypes.DateDtype.name: "DATE", + db_dtypes.TimeDtype.name: "TIME", + # TODO(tswast): Add support for JSON. +} + + +def dataframe_to_bigquery_fields( + dataframe, + override_bigquery_fields=None, + default_type="STRING", + index=False, +) -> Tuple[schema.SchemaField]: + """Convert a pandas DataFrame schema to a BigQuery schema. + + Args: + dataframe (pandas.DataFrame): + DataFrame for which the client determines the BigQuery schema. + override_bigquery_fields (Sequence[Union[ \ + :class:`~google.cloud.bigquery.schema.SchemaField`, \ + Mapping[str, Any] \ + ]]): + A BigQuery schema.
Use this argument to override the autodetected + type for some or all of the DataFrame columns. + + Returns: + Sequence[google.cloud.bigquery.schema.SchemaField]: + The automatically determined schema. Columns whose type cannot + be determined fall back to ``default_type``, with a warning. + """ + if override_bigquery_fields: + override_bigquery_fields = pandas_gbq.schema.bigquery.to_schema_fields( + override_bigquery_fields + ) + override_fields_by_name = { + field.name: field for field in override_bigquery_fields + } + override_fields_unused = set(override_fields_by_name.keys()) + else: + override_fields_by_name = {} + override_fields_unused = set() + + bq_schema_out = [] + unknown_type_fields = [] + + # TODO(tswast): Support index=True in to_gbq. + for column, dtype in pandas_gbq.core.pandas.list_columns_and_indexes( + dataframe, index=index + ): + # Use provided type from schema, if present. + bq_field = override_fields_by_name.get(column) + if bq_field: + bq_schema_out.append(bq_field) + override_fields_unused.discard(bq_field.name) + continue + + # Try to automatically determine the type based on the pandas dtype. + bq_field = dtype_to_bigquery_field(column, dtype) + if bq_field: + bq_schema_out.append(bq_field) + continue + + # Try to automatically determine the type based on a few rows of the data. + values = dataframe.reset_index()[column] + bq_field = values_to_bigquery_field(column, values) + + if bq_field: + bq_schema_out.append(bq_field) + continue + + # Try to automatically determine the type based on the arrow conversion. + try: + arrow_value = pyarrow.array(values) + bq_field = ( + pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field( + column, arrow_value.type + ) + ) + + if bq_field: + bq_schema_out.append(bq_field) + continue + except pyarrow.lib.ArrowInvalid: + # TODO(tswast): Better error message if conversion to arrow fails. + pass + + # Unknown field type. + bq_field = schema.SchemaField(column, default_type) + bq_schema_out.append(bq_field) + unknown_type_fields.append(bq_field) + + # Catch any schema mismatch. The developer explicitly asked to serialize a + # column, but it was not found. + if override_fields_unused: + raise ValueError( + "Provided BigQuery fields contain field(s) not present in DataFrame: {}".format( + override_fields_unused + ) + ) + + # Warn if schema detection was not successful for all columns. + if unknown_type_fields: + msg = "Could not determine the type of columns: {}".format( + ", ".join(field.name for field in unknown_type_fields) + ) + warnings.warn(msg) + + return tuple(bq_schema_out) + + +def dtype_to_bigquery_field(name, dtype) -> Optional[schema.SchemaField]: + bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name) + + if bq_type is not None: + return schema.SchemaField(name, bq_type) + + if hasattr(pandas, "ArrowDtype") and isinstance(dtype, pandas.ArrowDtype): + return pandas_gbq.schema.pyarrow_to_bigquery.arrow_type_to_bigquery_field( + name, dtype.pyarrow_dtype + ) + + return None + + +def value_to_bigquery_field(name, value) -> Optional[schema.SchemaField]: + if isinstance(value, str): + return schema.SchemaField(name, "STRING") + + # For timezone-naive datetimes, the later pyarrow conversion used to try + # and learn the type adds a timezone to such datetimes, causing them to be + # recognized as TIMESTAMP type. We thus additionally check the actual data + # to see if we need to overrule that and choose DATETIME instead.
+ # + # See: https://github.com/googleapis/python-bigquery/issues/985 + # and https://github.com/googleapis/python-bigquery/pull/1061 + # and https://github.com/googleapis/python-bigquery-pandas/issues/450 + if isinstance(value, datetime.datetime): + if value.tzinfo is not None: + return schema.SchemaField(name, "TIMESTAMP") + else: + return schema.SchemaField(name, "DATETIME") + + if _BaseGeometry is not None and isinstance(value, _BaseGeometry): + return schema.SchemaField(name, "GEOGRAPHY") + + return None + + +def values_to_bigquery_field(name, values) -> Optional[schema.SchemaField]: + value = pandas_gbq.core.pandas.first_valid(values) + + # All NULL, type not determinable. + if value is None: + return None + + field = value_to_bigquery_field(name, value) + if field is not None: + return field + + if isinstance(value, str): + return schema.SchemaField(name, "STRING") + + # Check plain ARRAY values here. Let STRUCT get determined by pyarrow, + # which can examine more values to determine all keys. + if isinstance(value, collections.abc.Iterable) and not isinstance( + value, collections.abc.Mapping + ): + # It could be that this value contains all None or is empty, so get the + # first non-None value we can find. + valid_item = pandas_gbq.core.pandas.first_array_valid(values) + field = value_to_bigquery_field(name, valid_item) + + if field is not None: + return schema.SchemaField(name, field.field_type, mode="REPEATED") + + return None diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py new file mode 100644 index 00000000..c63559eb --- /dev/null +++ b/pandas_gbq/schema/pyarrow_to_bigquery.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +from typing import Optional, cast + +from google.cloud.bigquery import schema +import pyarrow +import pyarrow.types + +_ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOLEAN", + pyarrow.int8().id: "INTEGER", + pyarrow.int16().id: "INTEGER", + pyarrow.int32().id: "INTEGER", + pyarrow.int64().id: "INTEGER", + pyarrow.uint8().id: "INTEGER", + pyarrow.uint16().id: "INTEGER", + pyarrow.uint32().id: "INTEGER", + pyarrow.uint64().id: "INTEGER", + pyarrow.float16().id: "FLOAT", + pyarrow.float32().id: "FLOAT", + pyarrow.float64().id: "FLOAT", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + pyarrow.large_string().id: "STRING", + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. 
+ pyarrow.decimal128(38, scale=9).id: "NUMERIC", + pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", +} + + +def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]: + detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None) + if detected_type is not None: + return schema.SchemaField(name, detected_type) + + if pyarrow.types.is_list(type_): + return arrow_list_type_to_bigquery(name, type_) + + if pyarrow.types.is_struct(type_): + inner_fields: list[schema.SchemaField] = [] + struct_type = cast(pyarrow.StructType, type_) + for field_index in range(struct_type.num_fields): + field = struct_type[field_index] + inner_fields.append(arrow_type_to_bigquery_field(field.name, field.type)) + + return schema.SchemaField(name, "RECORD", fields=inner_fields) + + return None + + +def arrow_list_type_to_bigquery(name, type_) -> Optional[schema.SchemaField]: + inner_field = arrow_type_to_bigquery_field(name, type_.value_type) + if inner_field is None: + return None + + return schema.SchemaField( + name, inner_field.field_type, mode="REPEATED", fields=inner_field.fields + ) diff --git a/pandas_gbq/version.py b/pandas_gbq/version.py index 73bc39ae..ec3eb08c 100644 --- a/pandas_gbq/version.py +++ b/pandas_gbq/version.py @@ -2,4 +2,4 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. -__version__ = "0.23.2" +__version__ = "0.24.0" diff --git a/setup.py b/setup.py index df793e59..10d97733 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "db-dtypes >=1.0.4,<2.0.0", "numpy >=1.18.1", "pandas >=1.1.4", - "pyarrow >=3.0.0", + "pyarrow >=4.0.0", "pydata-google-auth >=1.5.0", # Note: google-api-core and google-auth are also included via transitive # dependency on google-cloud-bigquery, but this library also uses them @@ -42,7 +42,8 @@ "bqstorage": [ "google-cloud-bigquery-storage >=2.16.2, <3.0.0dev", ], - "tqdm": "tqdm>=4.23.0", + "tqdm": ["tqdm>=4.23.0"], + "geopandas": ["geopandas>=0.9.0", "Shapely>=1.8.4"], } # Setup boilerplate below this line. diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index e551d17e..8d6ef4f4 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -7,6 +7,7 @@ # Then this file should have foo==1.14.0 # protobuf==3.19.5 db-dtypes==1.0.4 +geopandas==0.9.0 google-api-core==2.10.2 google-auth==2.13.0 google-auth-oauthlib==0.7.0 @@ -14,7 +15,8 @@ google-cloud-bigquery==3.4.2 google-cloud-bigquery-storage==2.16.2 numpy==1.18.1 pandas==1.1.4 -pyarrow==3.0.0 +pyarrow==4.0.0 pydata-google-auth==1.5.0 +Shapely==1.8.4 tqdm==4.23.0 -packaging==22.0.0 \ No newline at end of file +packaging==22.0.0 diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 2e7245d5..6352fbd7 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -10,6 +10,7 @@ import db_dtypes import pandas import pandas.testing +import pyarrow import pytest pytest.importorskip("google.cloud.bigquery", minversion="1.24.0") @@ -125,6 +126,37 @@ def test_series_round_trip( ) DATAFRAME_ROUND_TRIPS = [ + # Ensure that a BOOLEAN column can be written with bool, boolean, and + # object dtypes.
See: + # https://github.com/googleapis/python-bigquery-pandas/issues/105 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "bool_col": pandas.Series( + [True, False, True], + dtype="bool", + ), + "boolean_col": pandas.Series( + [None, True, False], + dtype="boolean", + ), + "object_col": pandas.Series( + [False, None, True], + dtype="object", + ), + } + ), + table_schema=[ + {"name": "bool_col", "type": "BOOLEAN"}, + {"name": "boolean_col", "type": "BOOLEAN"}, + {"name": "object_col", "type": "BOOLEAN"}, + ], + api_methods={"load_csv", "load_parquet"}, + ), + id="boolean", + ), # Ensure that a DATE column can be written with datetime64[ns] dtype # data. See: # https://github.com/googleapis/python-bigquery-pandas/issues/362 @@ -176,6 +208,96 @@ def test_series_round_trip( {"name": "date_col", "type": "DATE"}, ], ), + # Loading an INTEGER column should work for any integer dtype. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="object", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="int8", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="int16", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="int32", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="uint8", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="uint16", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="uint32", + ), + } + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object": pandas.Series( + [None, 1, -2], + dtype="Int64", + ), + "nullable_int64": pandas.Series( + [3, None, -4], + dtype="Int64", + ), + "int8": pandas.Series( + [5, -6, 7], + dtype="Int64", + ), + "int16": pandas.Series( + [-8, 9, -10], + dtype="Int64", + ), + "int32": pandas.Series( + [11, -12, 13], + dtype="Int64", + ), + "int64": pandas.Series( + [-14, 15, -16], + dtype="Int64", + ), + "uint8": pandas.Series( + [0, 1, 2], + dtype="Int64", + ), + "uint16": pandas.Series( + [3, 4, 5], + dtype="Int64", + ), + "uint32": pandas.Series( + [6, 7, 8], + dtype="Int64", + ), + } + ), + api_methods={"load_csv", "load_parquet"}, + ), + id="integer", + ), # Loading a NUMERIC column should work for floating point objects. See: # https://github.com/googleapis/python-bigquery-pandas/issues/421 DataFrameRoundTripTestCase( @@ -240,6 +362,133 @@ def test_series_round_trip( ), id="issue365-extreme-datetimes", ), + pytest.param( + # Load STRUCT and ARRAY using either object column or ArrowDtype. + # See: https://github.com/googleapis/python-bigquery-pandas/issues/452 + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + # Array of DATETIME requires inspection into list elements. 
+ # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype="object", + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], dtype="object" + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "arrow_array": pandas.Series( + [[1, 2, 3], None, [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "arrow_array_of_struct": pandas.Series( + [ + [{"test": "str5"}], + None, + [{"test": "str6"}, {"test": "str7"}], + ], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + }, + ), + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2], + "object_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype=pandas.ArrowDtype( + pyarrow.struct([("test", pyarrow.string())]), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + # Array of DATETIME requires inspection into list elements. + # See: + # https://github.com/googleapis/python-bigquery/pull/1061 + "object_array_datetime": pandas.Series( + [[], [datetime.datetime(1998, 9, 4, 12, 0, 0)], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.timestamp("us"))) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "object_array_of_struct": pandas.Series( + [[], [{"test": "str4"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "arrow_struct": pandas.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ], + dtype=pandas.ArrowDtype( + pyarrow.struct( + [ + ("version", pyarrow.int64()), + ("project", pyarrow.string()), + ] + ) + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "arrow_array": pandas.Series( + [[1, 2, 3], [], [4, 5, 6]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.int64()), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "arrow_array_of_struct": pandas.Series( + [[{"test": "str5"}], [], [{"test": "str6"}, {"test": "str7"}]], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])), + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + }, + ), + api_methods={"load_parquet"}, + ), + id="struct", + ), ] @@ -264,13 +513,20 @@ def test_dataframe_round_trip_with_table_schema( method_under_test( input_df, table_id, table_schema=table_schema, api_method=api_method ) - round_trip = read_gbq( - table_id, - dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), - # BigQuery Storage API is required to avoid out-of-bound due to extra - # day from rounding error which was fixed in google-cloud-bigquery - # 2.6.0. 
https://github.com/googleapis/python-bigquery/pull/402 - use_bqstorage_api=True, + round_trip = ( + read_gbq( + table_id, + dtypes=dict(zip(expected_df.columns, expected_df.dtypes)), + # BigQuery Storage API is required to avoid out-of-bound due to extra + # day from rounding error which was fixed in google-cloud-bigquery + # 2.6.0. https://github.com/googleapis/python-bigquery/pull/402 + use_bqstorage_api=True, + ) + .set_index("row_num") + .sort_index() + ) + + # TODO(tswast): Support writing index columns if to_gbq(index=True). + pandas.testing.assert_frame_equal( + expected_df.set_index("row_num").sort_index(), round_trip ) - round_trip.sort_values("row_num", inplace=True) - pandas.testing.assert_frame_equal(expected_df, round_trip) diff --git a/tests/unit/schema/__init__.py b/tests/unit/schema/__init__.py new file mode 100644 index 00000000..02d26e8e --- /dev/null +++ b/tests/unit/schema/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. diff --git a/tests/unit/schema/test_pandas_to_bigquery.py b/tests/unit/schema/test_pandas_to_bigquery.py new file mode 100644 index 00000000..924ce1ee --- /dev/null +++ b/tests/unit/schema/test_pandas_to_bigquery.py @@ -0,0 +1,156 @@ +# Copyright (c) 2019 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import collections +import datetime +import operator + +from google.cloud.bigquery import schema +import pandas +import pytest + + +@pytest.fixture +def module_under_test(): + from pandas_gbq.schema import pandas_to_bigquery + + return pandas_to_bigquery + + +def test_dataframe_to_bigquery_fields_w_named_index(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.Index(["a", "b"], name="str_index") + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_w_multiindex(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + index = pandas.MultiIndex.from_tuples( + [ + ("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)), + ("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)), + ], + names=["str_index", "int_index", "dt_index"], + ) + dataframe = pandas.DataFrame(df_data, index=index) + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, [], index=True + ) + + expected_schema = ( + schema.SchemaField("str_index", "STRING", "NULLABLE"), + schema.SchemaField("int_index", "INTEGER", "NULLABLE"), + schema.SchemaField("dt_index", "DATETIME", "NULLABLE"), + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"), + ) + assert returned_schema == expected_schema + + +def 
test_dataframe_to_bigquery_fields_w_bq_schema(module_under_test): + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + dataframe = pandas.DataFrame(df_data) + + dict_schema = [ + {"name": "str_column", "type": "STRING", "mode": "NULLABLE"}, + {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"}, + ] + + returned_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, dict_schema + ) + + expected_schema = ( + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOL", "REQUIRED"), + ) + assert returned_schema == expected_schema + + +def test_dataframe_to_bigquery_fields_fallback_needed_w_pyarrow(module_under_test): + dataframe = pandas.DataFrame( + data=[ + {"id": 10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)}, + {"id": 20, "status": "BAR", "created_at": datetime.date(2018, 9, 12)}, + ] + ) + + detected_schema = module_under_test.dataframe_to_bigquery_fields( + dataframe, override_bigquery_fields=[] + ) + expected_schema = ( + schema.SchemaField("id", "INTEGER", mode="NULLABLE"), + schema.SchemaField("status", "STRING", mode="NULLABLE"), + schema.SchemaField("created_at", "DATE", mode="NULLABLE"), + ) + by_name = operator.attrgetter("name") + assert sorted(detected_schema, key=by_name) == sorted(expected_schema, key=by_name) + + +def test_dataframe_to_bigquery_fields_w_extra_fields(module_under_test): + with pytest.raises(ValueError) as exc_context: + module_under_test.dataframe_to_bigquery_fields( + pandas.DataFrame(), + override_bigquery_fields=(schema.SchemaField("not_in_df", "STRING"),), + ) + message = str(exc_context.value) + assert ( + "Provided BigQuery fields contain field(s) not present in DataFrame:" in message + ) + assert "not_in_df" in message + + +def test_dataframe_to_bigquery_fields_geography(module_under_test): + geopandas = pytest.importorskip("geopandas") + from shapely import wkt + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + bq_schema = module_under_test.dataframe_to_bigquery_fields(df, []) + assert bq_schema == ( + schema.SchemaField("name", "STRING"), + schema.SchemaField("geo1", "GEOGRAPHY"), + schema.SchemaField("geo2", "GEOGRAPHY"), + ) diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py new file mode 100644 index 00000000..9a20e342 --- /dev/null +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +import pyarrow + +from pandas_gbq.schema import pyarrow_to_bigquery + + +def test_arrow_type_to_bigquery_field_unknown(): + # Default types should be picked at a higher layer. + assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null()) + is None + ) + + +def test_arrow_type_to_bigquery_field_list_of_unknown(): + # Default types should be picked at a higher layer. 
+ assert ( + pyarrow_to_bigquery.arrow_type_to_bigquery_field( + "test_name", pyarrow.list_(pyarrow.null()) + ) + is None + ) diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 92a09a3f..75574820 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -635,7 +635,15 @@ def test_read_gbq_wo_verbose_w_new_pandas_no_warnings(monkeypatch, recwarn): mock.PropertyMock(return_value=False), ) gbq.read_gbq("SELECT 1", project_id="my-project", dialect="standard") - assert len(recwarn) == 0 + # This test was intended to check for warnings about the deprecation of + # the argument `verbose` (which was removed from gbq (~v0.4.0) and + # pandas (~v0.23.0)). (See https://github.com/googleapis/python-bigquery-pandas/pull/158/files) + # This test should not fail upon seeing a warning regarding a pending + # deprecation related to rfc9110 delimiters. + # TODO: this and related tests have likely outlived their usefulness; + # consider removing. + for warning in recwarn.list: + assert "delimiter" in str(warning.message) def test_read_gbq_with_old_bq_raises_importerror(monkeypatch): @@ -660,7 +668,15 @@ def test_read_gbq_with_verbose_old_pandas_no_warnings(monkeypatch, recwarn): dialect="standard", verbose=True, ) - assert len(recwarn) == 0 + # This test was intended to check for warnings about the deprecation of + # the argument `verbose` (which was removed from gbq (~v0.4.0) and + # pandas (~v0.23.0)). (See https://github.com/googleapis/python-bigquery-pandas/pull/158/files) + # This test should not fail upon seeing a warning regarding a pending + # deprecation related to rfc9110 delimiters. + # TODO: this and related tests have likely outlived their usefulness; + # consider removing. + for warning in recwarn.list: + assert "delimiter" in str(warning.message) def test_read_gbq_with_private_raises_notimplmentederror(): diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 45c73533..bb611781 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -165,11 +165,8 @@ def test_load_csv_from_file_generates_schema(mock_bigquery_client): assert sent_schema[2].field_type == "FLOAT" assert sent_schema[3].name == "string_col" assert sent_schema[3].field_type == "STRING" - # TODO: Disambiguate TIMESTAMP from DATETIME based on if column is - # localized or at least use field type from table metadata.
See: - # https://github.com/googleapis/python-bigquery-pandas/issues/450 assert sent_schema[4].name == "datetime_col" - assert sent_schema[4].field_type == "TIMESTAMP" + assert sent_schema[4].field_type == "DATETIME" assert sent_schema[5].name == "timestamp_col" assert sent_schema[5].field_type == "TIMESTAMP" diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 7fdc616c..48e8862a 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -7,14 +7,12 @@ import google.cloud.bigquery import pandas +import pyarrow import pytest - -@pytest.fixture -def module_under_test(): - import pandas_gbq.schema - - return pandas_gbq.schema +import pandas_gbq +import pandas_gbq.gbq +import pandas_gbq.schema @pytest.mark.parametrize( @@ -45,17 +43,15 @@ def module_under_test(): ), ], ) -def test_schema_is_subset_passes_if_subset( - module_under_test, original_fields, dataframe_fields -): +def test_schema_is_subset_passes_if_subset(original_fields, dataframe_fields): # Issue #24 schema_is_subset indicates whether the schema of the # dataframe is a subset of the schema of the bigquery table table_schema = {"fields": original_fields} tested_schema = {"fields": dataframe_fields} - assert module_under_test.schema_is_subset(table_schema, tested_schema) + assert pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) -def test_schema_is_subset_fails_if_not_subset(module_under_test): +def test_schema_is_subset_fails_if_not_subset(): table_schema = { "fields": [ {"name": "A", "type": "FLOAT"}, @@ -66,12 +62,17 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): tested_schema = { "fields": [{"name": "A", "type": "FLOAT"}, {"name": "C", "type": "FLOAT"}] } - assert not module_under_test.schema_is_subset(table_schema, tested_schema) + assert not pandas_gbq.schema.schema_is_subset(table_schema, tested_schema) @pytest.mark.parametrize( "dataframe,expected_schema", [ + pytest.param( + pandas.DataFrame(data={"col1": [object()]}), + {"fields": [{"name": "col1", "type": "STRING"}]}, + id="default-type-fails-pyarrow-conversion", + ), ( pandas.DataFrame(data={"col1": [1, 2, 3]}), {"fields": [{"name": "col1", "type": "INTEGER"}]}, @@ -88,13 +89,39 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): pandas.DataFrame(data={"col1": ["hello", "world"]}), {"fields": [{"name": "col1", "type": "STRING"}]}, ), - ( - pandas.DataFrame(data={"col1": [datetime.datetime.now()]}), - {"fields": [{"name": "col1", "type": "TIMESTAMP"}]}, + pytest.param( + # No time zone -> DATETIME, + # Time zone -> TIMESTAMP + # See: https://github.com/googleapis/python-bigquery-pandas/issues/450 + pandas.DataFrame( + data={ + "object1": pandas.Series([datetime.datetime.now()], dtype="object"), + "object2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], dtype="object" + ), + "datetime1": pandas.Series( + [datetime.datetime.now()], dtype="datetime64[ns]" + ), + "datetime2": pandas.Series( + [datetime.datetime.now(datetime.timezone.utc)], + dtype="datetime64[ns, UTC]", + ), + } + ), + { + "fields": [ + {"name": "object1", "type": "DATETIME"}, + {"name": "object2", "type": "TIMESTAMP"}, + {"name": "datetime1", "type": "DATETIME"}, + {"name": "datetime2", "type": "TIMESTAMP"}, + ] + }, + id="issue450-datetime", ), ( pandas.DataFrame( data={ + "col0": [datetime.datetime.now(datetime.timezone.utc)], "col1": [datetime.datetime.now()], "col2": ["hello"], "col3": [3.14], @@ -104,7 +131,8 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ), { "fields": [ - 
{"name": "col1", "type": "TIMESTAMP"}, + {"name": "col0", "type": "TIMESTAMP"}, + {"name": "col1", "type": "DATETIME"}, {"name": "col2", "type": "STRING"}, {"name": "col3", "type": "FLOAT"}, {"name": "col4", "type": "BOOLEAN"}, @@ -112,10 +140,83 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): ] }, ), + pytest.param( + # uint8, which is the result from get_dummies, should be INTEGER. + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + pandas.DataFrame({"col": [0, 1]}, dtype="uint8"), + {"fields": [{"name": "col", "type": "INTEGER"}]}, + id="issue616-uint8", + ), + pytest.param( + # object column containing dictionaries should load to STRUCT. + # https://github.com/googleapis/python-bigquery-pandas/issues/452 + pandas.DataFrame( + { + "my_struct": pandas.Series( + [{"test": "str1"}, {"test": "str2"}, {"test": "str3"}], + dtype="object", + ), + } + ), + { + "fields": [ + { + "name": "my_struct", + "type": "RECORD", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"} + ], + } + ] + }, + id="issue452-struct", + ), + pytest.param( + pandas.DataFrame( + { + "object": pandas.Series([[], ["abc"], []], dtype="object"), + "list": pandas.Series( + [[], [1, 2, 3], []], + dtype=pandas.ArrowDtype(pyarrow.list_(pyarrow.int64())) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + "list_of_struct": pandas.Series( + [[], [{"test": "abc"}], []], + dtype=pandas.ArrowDtype( + pyarrow.list_(pyarrow.struct([("test", pyarrow.string())])) + ) + if hasattr(pandas, "ArrowDtype") + else "object", + ), + } + ), + { + "fields": [ + {"name": "object", "type": "STRING", "mode": "REPEATED"}, + {"name": "list", "type": "INTEGER", "mode": "REPEATED"}, + { + "name": "list_of_struct", + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + {"name": "test", "type": "STRING", "mode": "NULLABLE"}, + ], + }, + ], + }, + id="array", + ), ], ) -def test_generate_bq_schema(module_under_test, dataframe, expected_schema): - schema = module_under_test.generate_bq_schema(dataframe) +def test_generate_bq_schema(dataframe, expected_schema): + schema = pandas_gbq.gbq._generate_bq_schema(dataframe) + + # NULLABLE is the default mode. 
+ for field in expected_schema["fields"]: + if "mode" not in field: + field["mode"] = "NULLABLE" + assert schema == expected_schema @@ -156,8 +257,8 @@ def test_generate_bq_schema(module_under_test, dataframe, expected_schema): ), ], ) -def test_update_schema(module_under_test, schema_old, schema_new, expected_output): - output = module_under_test.update_schema(schema_old, schema_new) +def test_update_schema(schema_old, schema_new, expected_output): + output = pandas_gbq.schema.update_schema(schema_old, schema_new) assert output == expected_output diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py index 23b7c9bd..60ea8025 100644 --- a/tests/unit/test_to_gbq.py +++ b/tests/unit/test_to_gbq.py @@ -4,6 +4,7 @@ import google.api_core.exceptions import google.cloud.bigquery +import pandas as pd from pandas import DataFrame import pytest @@ -158,3 +159,27 @@ def test_to_gbq_with_if_exists_unknown(): project_id="myproj", if_exists="unknown", ) + + +@pytest.mark.parametrize( + "user_agent,rfc9110_delimiter,expected", + [ + ( + "test_user_agent/2.0.42", + False, + f"test_user_agent/2.0.42 pandas-{pd.__version__}", + ), + (None, False, f"pandas-{pd.__version__}"), + ( + "test_user_agent/2.0.42", + True, + f"test_user_agent/2.0.42 pandas/{pd.__version__}", + ), + (None, True, f"pandas/{pd.__version__}"), + ], +) +def test_create_user_agent(user_agent, rfc9110_delimiter, expected): + from pandas_gbq.gbq import create_user_agent + + result = create_user_agent(user_agent, rfc9110_delimiter) + assert result == expected
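# A minimal sketch (not part of the diff) tying the pieces together: a
# self-contained illustration of the schema inference changes exercised by
# the tests above. Naive datetimes now map to DATETIME, timezone-aware ones
# to TIMESTAMP, uint8 to INTEGER, and dict-valued object columns to RECORD
# (STRUCT). Column names here are arbitrary.
import datetime

import pandas

from pandas_gbq.schema.pandas_to_bigquery import dataframe_to_bigquery_fields

frame = pandas.DataFrame(
    {
        "naive": pandas.Series(
            [datetime.datetime(2024, 10, 14)], dtype="datetime64[ns]"
        ),
        "aware": pandas.Series(
            [datetime.datetime(2024, 10, 14, tzinfo=datetime.timezone.utc)],
            dtype="datetime64[ns, UTC]",
        ),
        "counts": pandas.Series([1], dtype="uint8"),
        "record": pandas.Series([{"key": "value"}], dtype="object"),
    }
)

for field in dataframe_to_bigquery_fields(frame):
    print(field.name, field.field_type)
# Expected: naive DATETIME, aware TIMESTAMP, counts INTEGER, record RECORD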