From baefadf62119741b9c9d9bb755f98d8adcebf9f9 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 17 Apr 2025 16:22:24 -0700 Subject: [PATCH 01/12] chore: blob.display to support width and height params (#1628) * chore: blob.display to support width and height params * wording --- bigframes/operations/blob.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 54078557ed..f0e25852cb 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -225,7 +225,14 @@ def write_url(self) -> bigframes.series.Series: ops.JSONValue(json_path="$.access_urls.write_url") ) - def display(self, n: int = 3, *, content_type: str = ""): + def display( + self, + n: int = 3, + *, + content_type: str = "", + width: Optional[int] = None, + height: Optional[int] = None, + ): """Display the blob content in the IPython Notebook environment. Only works for image type now. .. note:: @@ -234,6 +241,8 @@ def display(self, n: int = 3, *, content_type: str = ""): Args: n (int, default 3): number of sample blob objects to display. content_type (str, default ""): content type of the blob. If unset, use the blob metadata of the storage. Possible values are "image", "audio" and "video". + width (int or None, default None): width in pixels that the image/video are constrained to. If unset, use the image/video's original size or ratio. No-op for other content types. + height (int or None, default None): height in pixels that the image/video are constrained to. If unset, use the image/video's original size or ratio. No-op for other content types. """ # col name doesn't matter here. Rename to avoid column name conflicts df = bigframes.series.Series(self._block).rename("blob_col").to_frame() @@ -259,13 +268,17 @@ def display_single_url( content_type = cast(str, content_type).casefold() if content_type.startswith("image"): - ipy_display.display(ipy_display.Image(url=read_url)) + ipy_display.display( + ipy_display.Image(url=read_url, width=width, height=height) + ) elif content_type.startswith("audio"): # using url somehow doesn't work with audios response = requests.get(read_url) ipy_display.display(ipy_display.Audio(response.content)) elif content_type.startswith("video"): - ipy_display.display(ipy_display.Video(read_url)) + ipy_display.display( + ipy_display.Video(read_url, width=width, height=height) + ) else: # display as raw data response = requests.get(read_url) ipy_display.display(response.content) From 233347aca0ac55b2407e0f49430bf13536986e25 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 18 Apr 2025 10:38:10 -0700 Subject: [PATCH 02/12] feat: enable local json string validations (#1614) Fixes internal issue 401055880 --- bigframes/core/local_data.py | 53 ++++++++++++++++++++++++------ tests/system/small/test_session.py | 53 ++++++++++++++++++++++++++++-- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 70b1741af7..e5c67fcf43 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -26,11 +26,12 @@ import geopandas # type: ignore import numpy as np -import pandas +import pandas as pd import pyarrow as pa import pyarrow.parquet # type: ignore import bigframes.core.schema as schemata +import bigframes.core.utils as utils import bigframes.dtypes @@ -58,15 +59,12 @@ class ManagedArrowTable: schema: schemata.ArraySchema = dataclasses.field(hash=False) id: 
uuid.UUID = dataclasses.field(default_factory=uuid.uuid4) - def __post_init__(self): - self.validate() - @functools.cached_property def metadata(self) -> LocalTableMetadata: return LocalTableMetadata.from_arrow(self.data) @classmethod - def from_pandas(cls, dataframe: pandas.DataFrame) -> ManagedArrowTable: + def from_pandas(cls, dataframe: pd.DataFrame) -> ManagedArrowTable: """Creates managed table from pandas. Ignores index, col names must be unique strings""" columns: list[pa.ChunkedArray] = [] fields: list[schemata.SchemaItem] = [] @@ -78,9 +76,11 @@ def from_pandas(cls, dataframe: pandas.DataFrame) -> ManagedArrowTable: columns.append(new_arr) fields.append(schemata.SchemaItem(str(name), bf_type)) - return ManagedArrowTable( + mat = ManagedArrowTable( pa.table(columns, names=column_names), schemata.ArraySchema(tuple(fields)) ) + mat.validate(include_content=True) + return mat @classmethod def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable: @@ -91,10 +91,12 @@ def from_pyarrow(self, table: pa.Table) -> ManagedArrowTable: columns.append(new_arr) fields.append(schemata.SchemaItem(name, bf_type)) - return ManagedArrowTable( + mat = ManagedArrowTable( pa.table(columns, names=table.column_names), schemata.ArraySchema(tuple(fields)), ) + mat.validate() + return mat def to_parquet( self, @@ -140,8 +142,7 @@ def itertuples( ): yield tuple(row_dict.values()) - def validate(self): - # TODO: Content-based validation for some datatypes (eg json, wkt, list) where logical domain is smaller than pyarrow type + def validate(self, include_content: bool = False): for bf_field, arrow_field in zip(self.schema.items, self.data.schema): expected_arrow_type = _get_managed_storage_type(bf_field.dtype) arrow_type = arrow_field.type @@ -150,6 +151,38 @@ def validate(self): f"Field {bf_field} has arrow array type: {arrow_type}, expected type: {expected_arrow_type}" ) + if include_content: + for batch in self.data.to_batches(): + for field in self.schema.items: + _validate_content(batch.column(field.column), field.dtype) + + +def _validate_content(array: pa.Array, dtype: bigframes.dtypes.Dtype): + """ + Recursively validates the content of a PyArrow Array based on the + expected BigFrames dtype, focusing on complex types like JSON, structs, + and arrays where the Arrow type alone isn't sufficient. + """ + # TODO: validate GEO data context. + if dtype == bigframes.dtypes.JSON_DTYPE: + values = array.to_pandas() + for data in values: + # Skip scalar null values to avoid `TypeError` from json.load. + if not utils.is_list_like(data) and pd.isna(data): + continue + try: + # Attempts JSON parsing. + json.loads(data) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON format found: {data!r}") from e + elif bigframes.dtypes.is_struct_like(dtype): + for field_name, dtype in bigframes.dtypes.get_struct_fields(dtype).items(): + _validate_content(array.field(field_name), dtype) + elif bigframes.dtypes.is_array_like(dtype): + return _validate_content( + array.flatten(), bigframes.dtypes.get_array_inner_type(dtype) + ) + # Sequential iterator, but could split into batches and leverage parallelism for speed def _iter_table( @@ -226,7 +259,7 @@ def _( def _adapt_pandas_series( - series: pandas.Series, + series: pd.Series, ) -> tuple[Union[pa.ChunkedArray, pa.Array], bigframes.dtypes.Dtype]: # Mostly rely on pyarrow conversions, but have to convert geo without its help. 
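     # (pyarrow has no native geometry type for geopandas data, so geo values
     # are serialized explicitly in the branch below before building the array.)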
if series.dtype == bigframes.dtypes.GEO_DTYPE: diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 24edc91c93..4e0f8376a9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -962,8 +962,8 @@ def test_read_pandas_json_series(session, write_engine): json_data = [ "1", None, - '["1","3","5"]', - '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', + '[1,"3",null,{"a":null}]', + '{"a":1,"b":["x","y"],"c":{"x":[],"y":null,"z":false}}', ] expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) @@ -975,6 +975,28 @@ def test_read_pandas_json_series(session, write_engine): ) +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_inline"), + pytest.param("bigquery_load"), + pytest.param("bigquery_streaming"), + ], +) +def test_read_pandas_json_series_w_invalid_json(session, write_engine): + json_data = [ + "False", # Should be "false" + ] + pd_s = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) + + with pytest.raises( + ValueError, + match="Invalid JSON format found", + ): + session.read_pandas(pd_s, write_engine=write_engine) + + @pytest.mark.parametrize( ("write_engine"), [ @@ -1056,6 +1078,33 @@ def test_read_pandas_w_nested_json(session, write_engine): pd.testing.assert_series_equal(bq_s, pd_s) +@pytest.mark.parametrize( + ("write_engine"), + [ + pytest.param("default"), + pytest.param("bigquery_inline"), + pytest.param("bigquery_load"), + pytest.param("bigquery_streaming"), + ], +) +def test_read_pandas_w_nested_invalid_json(session, write_engine): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + data = [ + [{"json_field": "NULL"}], # Should be "null" + ] + pa_array = pa.array(data, type=pa.list_(pa.struct([("json_field", pa.string())]))) + pd_s = pd.Series( + arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("json_field", bigframes.dtypes.JSON_ARROW_TYPE)])) + ), + ) + + with pytest.raises(ValueError, match="Invalid JSON format found"): + session.read_pandas(pd_s, write_engine=write_engine) + + @pytest.mark.parametrize( ("write_engine"), [ From f4e5b26b7b7b00ef807987c4b9c5fded56ad883f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 18 Apr 2025 14:49:06 -0700 Subject: [PATCH 03/12] feat: enhance `read_csv` `index_col` parameter support (#1631) This PR expands the functionality of the `index_col` parameter in the `read_csv` method. New capabilities include: 1. **Multi-column Indexing:** `index_col` now accepts an iterable of strings (column names) to create a MultiIndex. (Fixes internal issue 338089659) 2. **Integer Indexing:** Support for a single integer index or an iterable of integers (column positions) is also explicitly included/verified. (Fixes internal issue 404530013) 3. **Pandas Compatibility:** Adds tests to ensure that the behavior of `index_col` when set to `False`, `None`, or `True` aligns with standard Pandas behavior. 
(Fixes internal issue 338400133) --- bigframes/session/__init__.py | 30 +------- .../session/_io/bigquery/read_gbq_table.py | 38 +++++++++- bigframes/session/loader.py | 12 ++- tests/system/small/test_session.py | 73 +++++++++++++------ tests/unit/session/test_session.py | 5 -- 5 files changed, 100 insertions(+), 58 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9d45019fc5..c77d11ddde 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -961,37 +961,13 @@ def _read_csv_w_bigquery_engine( f"{constants.FEEDBACK_LINK}" ) - # TODO(b/338089659): Looks like we can relax this 1 column - # restriction if we check the contents of an iterable are strings - # not integers. - if ( - # Empty tuples, None, and False are allowed and falsey. - index_col - and not isinstance(index_col, bigframes.enums.DefaultIndexKind) - and not isinstance(index_col, str) - ): - raise NotImplementedError( - "BigQuery engine only supports a single column name for `index_col`, " - f"got: {repr(index_col)}. {constants.FEEDBACK_LINK}" - ) + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") # None and False cannot be passed to read_gbq. - # TODO(b/338400133): When index_col is None, we should be using the - # first column of the CSV as the index to be compatible with the - # pandas engine. According to the pandas docs, only "False" - # indicates a default sequential index. - if not index_col: + if index_col is None or index_col is False: index_col = () - index_col = typing.cast( - Union[ - Sequence[str], # Falsey values - bigframes.enums.DefaultIndexKind, - str, - ], - index_col, - ) - # usecols should only be an iterable of strings (column names) for use as columns in read_gbq. columns: Tuple[Any, ...] = tuple() if usecols is not None: diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 9fa97cb6e1..34183b22bc 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -230,7 +230,11 @@ def _is_table_clustered_or_partitioned( def get_index_cols( table: bigquery.table.Table, - index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind, + index_col: Iterable[str] + | str + | Iterable[int] + | int + | bigframes.enums.DefaultIndexKind, ) -> List[str]: """ If we can get a total ordering from the table, such as via primary key @@ -240,6 +244,8 @@ def get_index_cols( # Transform index_col -> index_cols so we have a variable that is # always a list of column names (possibly empty). + schema_len = len(table.schema) + index_cols: List[str] = [] if isinstance(index_col, bigframes.enums.DefaultIndexKind): if index_col == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64: # User has explicity asked for a default, sequential index. @@ -255,9 +261,35 @@ def get_index_cols( f"Got unexpected index_col {repr(index_col)}. {constants.FEEDBACK_LINK}" ) elif isinstance(index_col, str): - index_cols: List[str] = [index_col] + index_cols = [index_col] + elif isinstance(index_col, int): + if not 0 <= index_col < schema_len: + raise ValueError( + f"Integer index {index_col} is out of bounds " + f"for table with {schema_len} columns (must be >= 0 and < {schema_len})." 
+            )
+        index_cols = [table.schema[index_col].name]
+    elif isinstance(index_col, Iterable):
+        for item in index_col:
+            if isinstance(item, str):
+                index_cols.append(item)
+            elif isinstance(item, int):
+                if not 0 <= item < schema_len:
+                    raise ValueError(
+                        f"Integer index {item} is out of bounds "
+                        f"for table with {schema_len} columns (must be >= 0 and < {schema_len})."
+                    )
+                index_cols.append(table.schema[item].name)
+            else:
+                raise TypeError(
+                    "If index_col is an iterable, it must contain either strings "
+                    "(column names) or integers (column positions)."
+                )
     else:
-        index_cols = list(index_col)
+        raise TypeError(
+            f"Unsupported type for index_col: {type(index_col).__name__}. Expected "
+            "an integer, a string, an iterable of strings, or an iterable of integers."
+        )
 
     # If there isn't an index selected, use the primary keys of the table as the
     # index. If there are no primary keys, we'll return an empty list.
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index bdcada6364..b053ed3c90 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -289,7 +289,11 @@ def read_gbq_table(
         self,
         query: str,
         *,
-        index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
+        index_col: Iterable[str]
+        | str
+        | Iterable[int]
+        | int
+        | bigframes.enums.DefaultIndexKind = (),
         columns: Iterable[str] = (),
         max_results: Optional[int] = None,
         api_name: str = "read_gbq_table",
@@ -516,7 +520,11 @@ def read_bigquery_load_job(
         filepath_or_buffer: str | IO["bytes"],
         *,
         job_config: bigquery.LoadJobConfig,
-        index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = (),
+        index_col: Iterable[str]
+        | str
+        | Iterable[int]
+        | int
+        | bigframes.enums.DefaultIndexKind = (),
         columns: Iterable[str] = (),
     ) -> dataframe.DataFrame:
         # Need to create session table beforehand
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 4e0f8376a9..c7e7fa3573 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -1216,55 +1216,86 @@ def test_read_csv_for_local_file_w_sep(session, df_and_local_csv, sep):
     pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
 
 
-def test_read_csv_w_index_col_false(session, df_and_local_csv):
+@pytest.mark.parametrize(
+    "index_col",
+    [
+        pytest.param(None, id="none"),
+        pytest.param(False, id="false"),
+        pytest.param([], id="empty_list"),
+    ],
+)
+def test_read_csv_for_index_col_w_false(session, df_and_local_csv, index_col):
     # Compares results for pandas and bigframes engines
     scalars_df, path = df_and_local_csv
     with open(path, "rb") as buffer:
         bf_df = session.read_csv(
             buffer,
             engine="bigquery",
-            index_col=False,
+            index_col=index_col,
         )
     with open(path, "rb") as buffer:
         # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
         pd_df = session.read_csv(
-            buffer, index_col=False, dtype=scalars_df.dtypes.to_dict()
+            buffer, index_col=index_col, dtype=scalars_df.dtypes.to_dict()
         )
 
-    assert bf_df.shape[0] == scalars_df.shape[0]
-    assert bf_df.shape[0] == pd_df.shape[0]
-
-    # We use a default index because of index_col=False, so the previous index
-    # column is just loaded as a column.
-    assert len(bf_df.columns) == len(scalars_df.columns) + 1
-    assert len(bf_df.columns) == len(pd_df.columns)
+    assert bf_df.shape == pd_df.shape
 
     # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
     # (b/280889935) or guarantee row ordering.
bf_df = bf_df.set_index("rowindex").sort_index() pd_df = pd_df.set_index("rowindex") - - pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) -def test_read_csv_w_index_col_column_label(session, df_and_gcs_csv): - scalars_df, path = df_and_gcs_csv - bf_df = session.read_csv(path, engine="bigquery", index_col="rowindex") +@pytest.mark.parametrize( + "index_col", + [ + pytest.param("rowindex", id="single_str"), + pytest.param(["rowindex", "bool_col"], id="multi_str"), + pytest.param(0, id="single_int"), + pytest.param([0, 2], id="multi_int"), + pytest.param([0, "bool_col"], id="mix_types"), + ], +) +def test_read_csv_for_index_col(session, df_and_gcs_csv, index_col): + scalars_pandas_df, path = df_and_gcs_csv + bf_df = session.read_csv(path, engine="bigquery", index_col=index_col) # Convert default pandas dtypes to match BigQuery DataFrames dtypes. pd_df = session.read_csv( - path, index_col="rowindex", dtype=scalars_df.dtypes.to_dict() + path, index_col=index_col, dtype=scalars_pandas_df.dtypes.to_dict() ) - assert bf_df.shape == scalars_df.shape assert bf_df.shape == pd_df.shape + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) - assert len(bf_df.columns) == len(scalars_df.columns) - assert len(bf_df.columns) == len(pd_df.columns) - pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) +@pytest.mark.parametrize( + ("index_col", "error_type", "error_msg"), + [ + pytest.param( + True, ValueError, "The value of index_col couldn't be 'True'", id="true" + ), + pytest.param(100, ValueError, "out of bounds", id="single_int"), + pytest.param([0, 200], ValueError, "out of bounds", id="multi_int"), + pytest.param( + [0.1], TypeError, "it must contain either strings", id="invalid_iterable" + ), + pytest.param( + 3.14, TypeError, "Unsupported type for index_col", id="unsupported_type" + ), + ], +) +def test_read_csv_raises_error_for_invalid_index_col( + session, df_and_gcs_csv, index_col, error_type, error_msg +): + _, path = df_and_gcs_csv + with pytest.raises( + error_type, + match=error_msg, + ): + session.read_csv(path, engine="bigquery", index_col=index_col) @pytest.mark.parametrize( diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 490ffc4108..22b439a38b 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -118,11 +118,6 @@ "BigQuery engine does not support these arguments", id="with_dtype", ), - pytest.param( - {"engine": "bigquery", "index_col": 5}, - "BigQuery engine only supports a single column name for `index_col`.", - id="with_index_col_not_str", - ), pytest.param( {"engine": "bigquery", "usecols": [1, 2]}, "BigQuery engine only supports an iterable of strings for `usecols`.", From 55e57f1fb722cb9d2f52e79543610fa2963405bc Mon Sep 17 00:00:00 2001 From: jialuoo Date: Fri, 18 Apr 2025 16:05:16 -0700 Subject: [PATCH 04/12] chore: improve improper naming in function session (#1635) --- bigframes/functions/_function_session.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index ec0e977782..6d2f0dbd57 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -886,7 +886,7 @@ def wrapper(func): signature, input_types, output_type # type: ignore ) - remote_function_client 
= _function_client.FunctionClient( + managed_function_client = _function_client.FunctionClient( dataset_ref.project, bq_location, dataset_ref.dataset_id, @@ -905,7 +905,7 @@ def wrapper(func): self._try_delattr(func, "is_row_processor") self._try_delattr(func, "ibis_node") - bq_function_name = remote_function_client.provision_bq_managed_function( + bq_function_name = managed_function_client.provision_bq_managed_function( func=func, input_types=tuple( third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) @@ -942,7 +942,7 @@ def wrapper(func): signature=(ibis_signature.input_types, ibis_signature.output_type), ) # type: ignore func.bigframes_bigquery_function = ( - remote_function_client.get_remote_function_fully_qualilfied_name( + managed_function_client.get_remote_function_fully_qualilfied_name( bq_function_name ) ) From 93f44a8daaeb5c2997afee1ef7b2f52fb98ffa4e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 21 Apr 2025 12:01:58 -0700 Subject: [PATCH 05/12] chore: update Multimodal snippets for display size (#1640) * chore: update Multimodal snippets for display size * wording --- samples/snippets/multimodal_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index e5236317e2..85e118d671 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -48,8 +48,8 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: df_image["updated"] = df_image["image"].blob.updated() df_image - # Filter images and display, you can also display audio and video types - df_image[df_image["author"] == "alice"]["image"].blob.display() + # Filter images and display, you can also display audio and video types. Use width/height parameters to constrain window sizes. + df_image[df_image["author"] == "alice"]["image"].blob.display(width=400) # [END bigquery_dataframes_multimodal_dataframe_merge] # [START bigquery_dataframes_multimodal_dataframe_image_transform] From 087a32ae024161d3b9306c007193242df83ef22c Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 21 Apr 2025 12:04:52 -0700 Subject: [PATCH 06/12] chore: add notebook showcasing `LinearRegression` on large data (#1613) * chore: add notebook to support medium blog on linear regression * change the modeling problem to o3 level prediction * train the model in partial ordering mode * fix typo * use 2025 instead of 2023 in the copyright header --- .../bq_dataframes_ml_linear_regression.ipynb | 40 +- ..._dataframes_ml_linear_regression_big.ipynb | 1064 +++++++++++++++++ noxfile.py | 1 + 3 files changed, 1087 insertions(+), 18 deletions(-) create mode 100644 notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb index fad2f00b31..4123dd0e1c 100644 --- a/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb +++ b/notebooks/ml/bq_dataframes_ml_linear_regression.ipynb @@ -35,24 +35,24 @@ "\n", "\n", " \n", " \n", " \n", "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", " \n", - " \n", + " \n", " \"GitHub\n", " View on GitHub\n", " \n", " \n", - " \n", + " \n", " \"Vertex\n", " Open in Vertex AI Workbench\n", " \n", " \n", - " \n", + " \n", " \"BQ\n", " Open in BQ Studio\n", " \n", @@ -79,7 +79,7 @@ "source": [ "## Overview\n", "\n", - "Use this notebook to learn how to train a linear regression model by using BigQuery DataFrames ML. BigQuery DataFrames ML provides a provides a scikit-learn-like API for ML powered by the BigQuery engine.\n", + "Use this notebook to learn how to train a linear regression model using BigQuery DataFrames ML. BigQuery DataFrames ML provides a provides a scikit-learn-like API for ML powered by the BigQuery engine.\n", "\n", "This example is adapted from the [BQML linear regression tutorial](https://cloud.google.com/bigquery-ml/docs/linear-regression-tutorial).\n", "\n", @@ -142,7 +142,10 @@ "source": [ "## Installation\n", "\n", - "Install the following packages, which are required to run this notebook:" + "If you don't have [bigframes](https://pypi.org/project/bigframes/) package already installed, uncomment and execute the following cells to\n", + "\n", + "1. Install the package\n", + "1. Restart the notebook kernel (Jupyter or Colab) to work with the package" ] }, { @@ -153,18 +156,7 @@ }, "outputs": [], "source": [ - "!pip install bigframes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only\n", - "\n", - "Uncomment and run the following cell to restart the kernel:" + "# !pip install bigframes" ] }, { @@ -749,6 +741,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" } }, "nbformat": 4, diff --git a/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb new file mode 100644 index 0000000000..0c5106f8f4 --- /dev/null +++ b/notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb @@ -0,0 +1,1064 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "## Train a linear regression model with BigQuery DataFrames ML\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.11" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This notebook demonstrates training a linear regression model on Big Data using BigQuery DataFrames ML. BigQuery DataFrames ML provides a provides a scikit-learn-like API for ML powered by the BigQuery engine.\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/python/docs/reference/bigframes/latest)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, we use BigQuery DataFrames to create a linear regression model that predicts the levels of Ozone in the atmosphere.\n", + "\n", + "The steps include:\n", + "\n", + "- Creating a DataFrame from the BigQuery table.\n", + "- Cleaning and preparing data using `bigframes.pandas` module.\n", + "- Creating a linear regression model using `bigframes.ml` module.\n", + "- Saving the ML model to BigQuery for future use.\n", + "\n", + "\n", + "Let's formally define our problem as: **Train a linear regression model to predict the level of ozone in the atmosphere given the measurements of other constituents and properties of the atmosphere.**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "In this tutorial we are going to use the [`bigquery-public-data.epa_historical_air_quality`](https://console.cloud.google.com/marketplace/product/epa/historical-air-quality) dataset. To quote the description of the dataset:\n", + "\n", + "\"The United States Environmental Protection Agency (EPA) protects both public health and the environment by establishing the standards for national air quality. The EPA provides annual summary data as well as hourly and daily data in the categories of criteria gases, particulates, meteorological, and toxics.\"\n", + "\n", + "There are several tables capturing data about the constituents of the atmosphere, see them in the [BigQuery cloud console](https://pantheon.corp.google.com/bigquery?p=bigquery-public-data&d=epa_historical_air_quality&page=dataset). Most tables carry 10's of GBs of data, but that is not an issue with BigQuery DataFrames as the data is efficiently processed at BigQuery without transferring them to the client." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (compute)\n", + "* BigQuery ML\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models)\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "If you don't have [bigframes](https://pypi.org/project/bigframes/) package already installed, uncomment and execute the following cells to\n", + "\n", + "1. Install the package\n", + "1. 
Restart the notebook kernel (Jupyter or Colab) to work with the package" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9O0Ka4W2MNF3" + }, + "outputs": [], + "source": [ + "# !pip install bigframes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "\n", + "# import IPython\n", + "#\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oDfTjfACBvJk" + }, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Set the BigQuery location\n", + "\n", + "You can also change the `LOCATION` variable used by BigQuery. Learn more about [BigQuery locations](https://cloud.google.com/bigquery/docs/locations#supported_locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eF-Twtc4XGem" + }, + "outputs": [], + "source": [ + "LOCATION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Set up APIs, IAM permissions and Authentication\n", + "\n", + "Follow the instructions at https://cloud.google.com/bigquery/docs/use-bigquery-dataframes#permissions.\n", + "\n", + "Depending on your notebook environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login\n", + "# ! gcloud auth application-default login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NPPMuw2PXGeo" + }, + "outputs": [], + "source": [ + "# NOTE: The project option is not required in all environments.\n", + "# On BigQuery Studio, the project ID is automatically detected.\n", + "bpd.options.bigquery.project = PROJECT_ID\n", + "\n", + "# NOTE: The location option is not required.\n", + "# It defaults to the location of the first table or query\n", + "# passed to read_gbq(). For APIs where a location can't be\n", + "# auto-detected, the location defaults to the \"US\" location.\n", + "bpd.options.bigquery.location = LOCATION\n", + "\n", + "# NOTE: For a machine learning model the order of the data is\n", + "# not important. So let's relax the ordering_mode to accept\n", + "# partial ordering. This allows BigQuery DataFrames to run cost\n", + "# and performance optimized jobs at the BigQuery engine.\n", + "bpd.options.bigquery.ordering_mode = \"partial\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D21CoOlfFTYI" + }, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bpd.close_session()`. After that, you can reuse `bpd.options.bigquery.location` to specify another location." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EMAqR37AfLS" + }, + "source": [ + "## Read data in BigQuery tables as DataFrame\n", + "\n", + "Let's read the tables in the dataset to construct a BigQuery DataFrames DataFrame. We will combine measurements of various parameters of the atmosphere from multiple tables to represent a consolidated dataframe to use for our model training and prediction. We have daily and hourly versions of the data available, but since we want to create a model that is dynamic so that it can capture the variance throughout the day, we would choose the hourly version.\n", + "\n", + "Note that we would use the pandas APIs as we normally would on the BigQuery DataFrames DataFrame, but calculations happen in the BigQuery query engine instead of the local environment." 
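+    ,
+    "\n",
+    "\n",
+    "As a minimal sketch of this pattern (the table and column names below belong to this dataset, but the snippet itself is illustrative rather than one of the tutorial steps):\n",
+    "\n",
+    "```python\n",
+    "import bigframes.pandas as bpd\n",
+    "\n",
+    "# Reading a table returns a DataFrame backed by BigQuery storage;\n",
+    "# no rows are copied to the client at this point.\n",
+    "df = bpd.read_gbq(\n",
+    "    \"bigquery-public-data.epa_historical_air_quality.o3_hourly_summary\"\n",
+    ")\n",
+    "\n",
+    "# Familiar pandas-style operations compile to SQL and run in BigQuery.\n",
+    "hourly_means = df.groupby(\"time_local\")[\"sample_measurement\"].mean()\n",
+    "```"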
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = \"bigquery-public-data.epa_historical_air_quality\"\n", + "hourly_summary_tables = [\n", + " \"co_hourly_summary\",\n", + " \"hap_hourly_summary\",\n", + " \"no2_hourly_summary\",\n", + " \"nonoxnoy_hourly_summary\",\n", + " \"o3_hourly_summary\",\n", + " \"pm10_hourly_summary\",\n", + " \"pm25_frm_hourly_summary\",\n", + " \"pm25_nonfrm_hourly_summary\",\n", + " \"pm25_speciation_hourly_summary\",\n", + " \"pressure_hourly_summary\",\n", + " \"rh_and_dp_hourly_summary\",\n", + " \"so2_hourly_summary\",\n", + " \"temperature_hourly_summary\",\n", + " \"voc_hourly_summary\",\n", + " \"wind_hourly_summary\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's pick index columns - to identify a measurement of the atmospheric parameter, param column - to identify which param the measurement pertains to, and value column - the column containing the measurement itself." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_columns = [\"state_name\", \"county_name\", \"site_num\", \"date_local\", \"time_local\"]\n", + "param_column = \"parameter_name\"\n", + "value_column = \"sample_measurement\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's observe how much data each table contains:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for table in hourly_summary_tables:\n", + " # get the bigframes global session\n", + " bigframes_session = bpd.get_global_session()\n", + "\n", + " # get the bigquery table info\n", + " table_info = bigframes_session.bqclient.get_table(f\"{dataset}.{table}\")\n", + "\n", + " # read the table as a dataframe\n", + " df = bpd.read_gbq(f\"{dataset}.{table}\")\n", + "\n", + " # print metadata about the table\n", + " print(\n", + " f\"{table}: \"\n", + " f\"{round(table_info.num_bytes/1_000_000_000, 1)} GB, \"\n", + " f\"{round(table_info.num_rows/1_000_000, 1)} million rows, \"\n", + " f\"{df[param_column].nunique()} params\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's be mindful that the rows in each table may contain duplicates, which may introdude bias in any model trained on the raw data. We will make sure to drop the duplicates when we use the data for model training.\n", + "\n", + "Since we want to predict ozone level, we obviously pick the `o3` table. Let's also pick the tables about other gases - `co`, `no2` and `so2`. Let's also pick `pressure` and `temperature` tables as they seem fundamental indicators for the atmosphere. Note that each of these tables capture measurements for a single parameter (i.e. the column `parameter_name` has a single unique value).\n", + "\n", + "We are also interested in the nonoxny and wind tables, but they capture multiple parameters (i.e. the column `parameter_name` has a more than one unique values). We will include their measurements in later step, as they require extar processing to separate out the measurements for the individual parameters.\n", + "\n", + "We skip the other tables in this exercise for either they have very little or fragmented data or they seem uninteresting for the purpose of predicting ozone levels. You can take this as a separate exercise to train a linear regression model by including those parameters. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's maintain an array of dtaframes, one for each parameter, and eventually combine them into a single dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params_dfs = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's process the tables with single parameter measurements first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EDAaIwHpQCDZ" + }, + "outputs": [], + "source": [ + "table_param_dict = {\n", + " \"co_hourly_summary\" : \"co\",\n", + " \"no2_hourly_summary\" : \"no2\",\n", + " \"o3_hourly_summary\" : \"o3\",\n", + " \"pressure_hourly_summary\" : \"pressure\",\n", + " \"so2_hourly_summary\" : \"so2\",\n", + " \"temperature_hourly_summary\" : \"temperature\",\n", + "}\n", + "\n", + "for table, param in table_param_dict.items():\n", + " param_df = bpd.read_gbq(\n", + " f\"{dataset}.{table}\",\n", + " columns=index_columns + [value_column]\n", + " )\n", + " param_df = param_df\\\n", + " .sort_values(index_columns)\\\n", + " .drop_duplicates(index_columns)\\\n", + " .set_index(index_columns)\\\n", + " .rename(columns={value_column : param})\n", + " params_dfs.append(param_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The nonoxnoy table captures measurements for 3 parameters. Let's analyze how many instances of each parameter it contains." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nonoxnoy_table = f\"{dataset}.nonoxnoy_hourly_summary\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpd.read_gbq(nonoxnoy_table, columns=[param_column]).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the NOy data is significantly sparse as compared to NO and NOx, so we skip that and include NO and NOx data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "no_df = bpd.read_gbq(\n", + " nonoxnoy_table,\n", + " columns=index_columns + [value_column],\n", + " filters=[(param_column, \"==\", \"Nitric oxide (NO)\")]\n", + ")\n", + "no_df = no_df\\\n", + " .sort_values(index_columns)\\\n", + " .drop_duplicates(index_columns)\\\n", + " .set_index(index_columns)\\\n", + " .rename(columns={value_column: \"no_\"})\n", + "params_dfs.append(no_df)\n", + "\n", + "nox_df = bpd.read_gbq(\n", + " nonoxnoy_table,\n", + " columns=index_columns + [value_column],\n", + " filters=[(param_column, \"==\", \"Oxides of nitrogen (NOx)\")]\n", + ")\n", + "nox_df = nox_df\\\n", + " .sort_values(index_columns)\\\n", + " .drop_duplicates(index_columns)\\\n", + " .set_index(index_columns)\\\n", + " .rename(columns={value_column: \"nox\"})\n", + "params_dfs.append(nox_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The wind table captures measurements for 2 parameters. Let's analyze how many instances of each parameter it contains." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wind_table = f\"{dataset}.wind_hourly_summary\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpd.read_gbq(wind_table, columns=[param_column]).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's include the data for wind speed and wind direction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wind_speed_df = bpd.read_gbq(\n", + " wind_table,\n", + " columns=index_columns + [value_column],\n", + " filters=[(param_column, \"==\", \"Wind Speed - Resultant\")]\n", + ")\n", + "wind_speed_df = wind_speed_df\\\n", + " .sort_values(index_columns)\\\n", + " .drop_duplicates(index_columns)\\\n", + " .set_index(index_columns)\\\n", + " .rename(columns={value_column: \"wind_speed\"})\n", + "params_dfs.append(wind_speed_df)\n", + "\n", + "wind_dir_df = bpd.read_gbq(\n", + " wind_table,\n", + " columns=index_columns + [value_column],\n", + " filters=[(param_column, \"==\", \"Wind Direction - Resultant\")]\n", + ")\n", + "wind_dir_df = wind_dir_df\\\n", + " .sort_values(index_columns)\\\n", + " .drop_duplicates(index_columns)\\\n", + " .set_index(index_columns)\\\n", + " .rename(columns={value_column: \"wind_dir\"})\n", + "params_dfs.append(wind_dir_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's observe each individual parameter and number of data points for each parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for param_df in params_dfs:\n", + " print(f\"{param_df.columns.values}: {len(param_df)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's combine data from all parameters into a single DataFrame. The measurements for each parameter may not be available for every (state, county, site, date, time) identifier, we will consider only those identifiers for which measurements of all parameters are available. To achieve this we will combine the measurements via \"inner\" join.\n", + "\n", + "We will also materialize this combined data via `cache` method for efficient reuse in the subsequent steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = bpd.concat(params_dfs, axis=1, join=\"inner\").cache()\n", + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rwPLjqW2Ajzh" + }, + "source": [ + "## Clean and prepare data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's temporarily bring the index columns as dataframe columns for further processing on the index values for the purpose of data preparation.\n", + "We will reconstruct the index back at the time of the model training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observe the years from which we have consolidated data so far." 
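+    ,
+    "\n",
+    "\n",
+    "The next cell calls `to_pandas()` on the aggregated result; the per-year counts form a small summary that is cheap to materialize locally. For a quick look at arbitrary rows of a larger intermediate frame under partial ordering mode, `peek()` (also used later in this notebook) is a lightweight alternative, e.g.:\n",
+    "\n",
+    "```python\n",
+    "# Fetch a few arbitrary rows without computing a total ordering.\n",
+    "df.peek(5)\n",
+    "```"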
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"date_local\"].dt.year.value_counts().sort_index().to_pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we would train a model from the past data to predict ozone levels for the future data. Let's define the cut-off year as 2020. We will pretend that the data before 2020 has known ozone levels, and the 2020 onwards the ozone levels are unknown, which we will predict using our model.\n", + "\n", + "We should further separate the known data into training and test sets. The model would be trained on the training set and then evaluated on the test set to make sure the model generalizes beyond the training data. We could use [train_test_split](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.model_selection#bigframes_ml_model_selection_train_test_split) method to randomly split the training and test data, but we leave that for you to try out. In this exercise, let's split based on another cutoff year 2017 - the known data before 2017 would be training data and 2017 onwards would be the test data. This way we stay with the idea that the model is trained on past data and then used to predict the future values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6i6HkFJZa8na" + }, + "outputs": [], + "source": [ + "train_data_filter = (df.date_local.dt.year < 2017)\n", + "test_data_filter = (df.date_local.dt.year >= 2017) & (df.date_local.dt.year < 2020)\n", + "predict_data_filter = (df.date_local.dt.year >= 2020)\n", + "\n", + "df_train = df[train_data_filter].set_index(index_columns)\n", + "df_test = df[test_data_filter].set_index(index_columns)\n", + "df_predict = df[predict_data_filter].set_index(index_columns)\n", + "\n", + "df_train.shape, df_test.shape, df_predict.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M_-0X7NxYK5f" + }, + "source": [ + "Prepare your feature (or input) columns and the target (or output) column for the purpose of model training and evaluation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YKwCW7Nsavap" + }, + "outputs": [], + "source": [ + "X_train = df_train.drop(columns=\"o3\")\n", + "y_train = df_train[\"o3\"]\n", + "\n", + "X_test = df_test.drop(columns=\"o3\")\n", + "y_test = df_test[\"o3\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the unknown data for prediction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wej78IDUaRW9" + }, + "outputs": [], + "source": [ + "X_predict = df_predict.drop(columns=\"o3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fx4lsNqMorJ-" + }, + "source": [ + "## Create the linear regression model\n", + "\n", + "BigQuery DataFrames ML lets you seamlessly transition from exploring data to creating machine learning models through its scikit-learn-like API, `bigframes.ml`. 
BigQuery DataFrames ML supports several types of [ML models](https://cloud.google.com/python/docs/reference/bigframes/latest#ml-capabilities).\n", + "\n", + "In this notebook, you create a [`LinearRegression`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LinearRegression) model, a type of regression model that generates a continuous value from a linear combination of input features.\n", + "\n", + "When you create a model with BigQuery DataFrames ML, it is saved in an internal location and limited to the BigQuery DataFrames session. However, as you'll see in the next section, you can use `to_gbq` to save the model permanently to your BigQuery project." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EloGtMnverFF" + }, + "source": [ + "### Create the model using `bigframes.ml`\n", + "\n", + "Please note that BigQuery DataFrames ML is backed by BigQuery ML, which uses\n", + "[automatic preprocessing](https://cloud.google.com/bigquery/docs/auto-preprocessing) to encode string values and scale numeric values when you pass the feature columns without transforms.\n", + "\n", + "BigQuery ML also [automatically splits the data for training and evaluation](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-glm#data_split_method), although for datasets with less than 500 rows (such as this one), all rows are used for training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GskyyUQPowBT" + }, + "outputs": [], + "source": [ + "from bigframes.ml.linear_model import LinearRegression\n", + "\n", + "model = LinearRegression()\n", + "\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UGjeMPC2caKK" + }, + "source": [ + "### Score the model\n", + "\n", + "Check how the model performs by using the [`score`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LinearRegression#bigframes_ml_linear_model_LinearRegression_score) method. More information on BigQuery ML model scoring can be found [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#mlevaluate_output)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kGBJKafpo0dl" + }, + "outputs": [], + "source": [ + "# On the training data\n", + "model.score(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# On the test data\n", + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P2lUiZZ_cjri" + }, + "source": [ + "### Predict using the model\n", + "\n", + "Use the model to predict the levels of ozone. The predicted levels are returned in the column `predicted_o3`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bsQ9cmoWo0Ps" + }, + "outputs": [], + "source": [ + "df_pred = model.predict(X_predict)\n", + "df_pred.peek()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GTRdUw-Ro5R1" + }, + "source": [ + "## Save the model in BigQuery\n", + "\n", + "The model is saved locally within this session. You can save the model permanently to BigQuery for use in future sessions, and to make the model sharable with others." 
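+    ,
+    "\n",
+    "\n",
+    "Once saved, the model can be loaded back in a later session. A minimal sketch (assuming the dataset and model name created in the cells below, and that the feature frame `X_predict` has been rebuilt in the new session):\n",
+    "\n",
+    "```python\n",
+    "import bigframes.pandas as bpd\n",
+    "\n",
+    "# Load the persisted BigQuery ML model into the current session.\n",
+    "reloaded = bpd.read_gbq_model(f\"{DATASET_ID}.o3_lr_model\")\n",
+    "\n",
+    "# The reloaded model exposes the same API, e.g. predict().\n",
+    "predictions = reloaded.predict(X_predict)\n",
+    "```"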
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K0mPaoGpcwwy" + }, + "source": [ + "Create a BigQuery dataset to house the model, adding a name for your dataset as the `DATASET_ID` variable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSP7gt13QrQt" + }, + "outputs": [], + "source": [ + "DATASET_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "if not DATASET_ID:\n", + " raise ValueError(\"Please define the DATASET_ID\")\n", + "\n", + "client = bpd.get_global_session().bqclient\n", + "dataset = client.create_dataset(DATASET_ID, exists_ok=True)\n", + "print(f\"Dataset {dataset.dataset_id} created.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zqAIWWgJczp-" + }, + "source": [ + "Save the model using the `to_gbq` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QE_GD4Byo_jb" + }, + "outputs": [], + "source": [ + "model.to_gbq(DATASET_ID + \".o3_lr_model\" , replace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f7uHacAy49rT" + }, + "source": [ + "You can view the saved model in the BigQuery console under the dataset you created in the first step. Run the following cell and follow the link to view your BigQuery console:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qDBoiA_0488Z" + }, + "outputs": [], + "source": [ + "print(f'https://console.cloud.google.com/bigquery?ws=!1m5!1m4!5m3!1s{PROJECT_ID}!2s{DATASET_ID}!3so3_lr_model')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G_wjSfXpWTuy" + }, + "source": [ + "# Summary and next steps\n", + "\n", + "You've created a linear regression model using `bigframes.ml`.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# # Delete the BigQuery dataset and associated ML model\n", + "# client.delete_dataset(DATASET_ID, delete_contents=True, not_found_ok=True)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index bf9a435b0f..b29cda7a51 100644 --- a/noxfile.py +++ b/noxfile.py @@ -767,6 +767,7 @@ def notebook(session: nox.Session): # our test infrastructure. "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. 
"notebooks/ml/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. + "notebooks/ml/bq_dataframes_ml_linear_regression_big.ipynb", # Needs DATASET_ID. "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. # TODO(b/332737009): investigate why we get 404 errors, even though # bq_dataframes_llm_code_generation creates a bucket in the sample. From f68b80cce2451a8c8d931a54e0cb69e02f34ce10 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 21 Apr 2025 16:16:20 -0700 Subject: [PATCH 07/12] docs: add code samples in the `udf` API docstring (#1632) --- bigframes/session/__init__.py | 67 +++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c77d11ddde..7229a4641b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1423,7 +1423,7 @@ def udf( packages: Optional[Sequence[str]] = None, ): """Decorator to turn a Python user defined function (udf) into a - BigQuery managed function. + [BigQuery managed user-defined function](https://cloud.google.com/bigquery/docs/user-defined-functions-python). .. note:: The udf must be self-contained, i.e. it must not contain any @@ -1431,9 +1431,70 @@ def udf( body. .. note:: - Please have following IAM roles enabled for you: + Please have BigQuery Data Editor (roles/bigquery.dataEditor) IAM + role enabled for you. - * BigQuery Data Editor (roles/bigquery.dataEditor) + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import datetime + >>> bpd.options.display.progress_bar = None + + Turning an arbitrary python function into a BigQuery managed python udf: + + >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") + >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) + ... def minutes_to_hours(x: int) -> float: + ... return x/60 + + >>> minutes = bpd.Series([0, 30, 60, 90, 120]) + >>> minutes + 0 0 + 1 30 + 2 60 + 3 90 + 4 120 + dtype: Int64 + + >>> hours = minutes.apply(minutes_to_hours) + >>> hours + 0 0.0 + 1 0.5 + 2 1.0 + 3 1.5 + 4 2.0 + dtype: Float64 + + To turn a user defined function with external package dependencies into + a BigQuery managed python udf, you would provide the names of the + packages (optionally with the package version) via `packages` param. + + >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f") + >>> @bpd.udf( + ... dataset="bigfranes_testing", + ... name=bq_name, + ... packages=["cryptography"] + ... ) + ... def get_hash(input: str) -> str: + ... from cryptography.fernet import Fernet + ... + ... # handle missing value + ... if input is None: + ... input = "" + ... + ... key = Fernet.generate_key() + ... f = Fernet(key) + ... 
return f.encrypt(input.encode()).decode() + + >>> names = bpd.Series(["Alice", "Bob"]) + >>> hashes = names.apply(get_hash) + + You can clean-up the BigQuery functions created above using the BigQuery + client from the BigQuery DataFrames session: + + >>> session = bpd.get_global_session() + >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) + >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) Args: input_types (type or sequence(type), Optional): From 3c314c3d328da2d534d542173e85730bedef518e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 21 Apr 2025 16:51:10 -0700 Subject: [PATCH 08/12] refactor: Unify compile paths with ResultNode (#1636) --- bigframes/core/compile/api.py | 14 ++- bigframes/core/compile/compiled.py | 25 +++-- bigframes/core/compile/compiler.py | 93 ++++++++++++------- .../core/compile/googlesql/expression.py | 2 +- bigframes/core/compile/googlesql/query.py | 19 +++- bigframes/core/nodes.py | 45 ++++++++- bigframes/core/rewrite/__init__.py | 9 +- bigframes/core/rewrite/order.py | 31 ++++++- bigframes/core/rewrite/pruning.py | 83 ++++++++--------- bigframes/core/rewrite/slices.py | 19 +++- 10 files changed, 224 insertions(+), 116 deletions(-) diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 32257c0f98..bb19f92be9 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -23,7 +23,6 @@ if TYPE_CHECKING: import bigframes.core.nodes import bigframes.core.ordering - import bigframes.core.schema class SQLCompiler: @@ -35,8 +34,8 @@ def compile( limit: Optional[int] = None, ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" - # If we are ordering the query anyways, compiling the slice as a limit is probably a good idea. 
- return compiler.compile_sql(node, ordered=ordered, limit=limit) + request = compiler.CompileRequest(node, sort_rows=ordered, peek_count=limit) + return compiler.compile_sql(request).sql def compile_raw( self, @@ -45,7 +44,12 @@ def compile_raw( str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering ]: """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - return compiler.compile_raw(node) + request = compiler.CompileRequest( + node, sort_rows=False, materialize_all_order_keys=True + ) + result = compiler.compile_sql(request) + assert result.row_order is not None + return result.sql, result.sql_schema, result.row_order def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): @@ -53,7 +57,7 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): import bigframes.core.schema node = compiler._replace_unsupported_ops(node) - node, _ = rewrite.pull_up_order(node, order_root=False) + node = rewrite.bake_order(node) ir = compiler.compile_node(node) items = tuple( bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id)) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 6202a34ce2..a79ad9fe55 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -69,23 +69,28 @@ def __init__( def to_sql( self, - *, - order_by: Sequence[OrderingExpression] = (), - limit: Optional[int] = None, - selections: Optional[Sequence[str]] = None, + order_by: Sequence[OrderingExpression], + limit: Optional[int], + selections: tuple[tuple[ex.DerefOp, str], ...], ) -> str: ibis_table = self._to_ibis_expr() # This set of output transforms maybe should be its own output node?? - if ( - order_by - or limit - or (selections and (tuple(selections) != tuple(self.column_ids))) - ): + + selection_strings = tuple((ref.id.sql, name) for ref, name in selections) + + names_preserved = tuple(name for _, name in selections) == tuple( + self.column_ids + ) + is_noop_selection = ( + all((i[0] == i[1] for i in selection_strings)) and names_preserved + ) + + if order_by or limit or not is_noop_selection: sql = ibis_bigquery.Backend().compile(ibis_table) sql = ( bigframes.core.compile.googlesql.Select() .from_(sql) - .select(selections or self.column_ids) + .select(selection_strings) .sql() ) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 04d3ea1bf9..8ca6cb35a3 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -13,8 +13,10 @@ # limitations under the License. from __future__ import annotations +import dataclasses import functools import typing +from typing import cast, Optional import bigframes_vendored.ibis.backends.bigquery as ibis_bigquery import bigframes_vendored.ibis.expr.api as ibis_api @@ -24,6 +26,7 @@ import pyarrow as pa from bigframes import dtypes, operations +from bigframes.core import expression import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.explode @@ -34,48 +37,58 @@ if typing.TYPE_CHECKING: import bigframes.core - import bigframes.session -def compile_sql( - node: nodes.BigFrameNode, - ordered: bool, - limit: typing.Optional[int] = None, -) -> str: - # later steps might add ids, so snapshot before those steps. 
-    output_ids = node.schema.names
-    if ordered:
-        # Need to do this before replacing unsupported ops, as that will rewrite slice ops
-        node, pulled_up_limit = rewrites.pullup_limit_from_slice(node)
-        if (pulled_up_limit is not None) and (
-            (limit is None) or limit > pulled_up_limit
-        ):
-            limit = pulled_up_limit
+@dataclasses.dataclass(frozen=True)
+class CompileRequest:
+    node: nodes.BigFrameNode
+    sort_rows: bool
+    materialize_all_order_keys: bool = False
+    peek_count: typing.Optional[int] = None
+
+
+@dataclasses.dataclass(frozen=True)
+class CompileResult:
+    sql: str
+    sql_schema: typing.Sequence[google.cloud.bigquery.SchemaField]
+    row_order: Optional[bf_ordering.RowOrdering]

-    node = _replace_unsupported_ops(node)
+
+def compile_sql(request: CompileRequest) -> CompileResult:
+    output_names = tuple((expression.DerefOp(id), id.sql) for id in request.node.ids)
+    result_node = nodes.ResultNode(
+        request.node,
+        output_cols=output_names,
+        limit=request.peek_count,
+    )
+    if request.sort_rows:
+        # Can only pull up slice if we are doing ORDER BY in outermost SELECT
+        # Need to do this before replacing unsupported ops, as that will rewrite slice ops
+        result_node = rewrites.pull_up_limits(result_node)
+    result_node = _replace_unsupported_ops(result_node)
     # prune before pulling up order to avoid unnnecessary row_number() ops
-    node = rewrites.column_pruning(node)
-    node, ordering = rewrites.pull_up_order(node, order_root=ordered)
-    # final pruning to cleanup up any leftovers unused values
-    node = rewrites.column_pruning(node)
-    return compile_node(node).to_sql(
-        order_by=ordering.all_ordering_columns if ordered else (),
-        limit=limit,
-        selections=output_ids,
+    result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+    result_node = rewrites.defer_order(
+        result_node, output_hidden_row_keys=request.materialize_all_order_keys
     )
+    if request.sort_rows:
+        result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+        sql = compile_result_node(result_node)
+        return CompileResult(
+            sql, result_node.schema.to_bigquery(), result_node.order_by
+        )

-
-def compile_raw(
-    node: nodes.BigFrameNode,
-) -> typing.Tuple[
-    str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering
-]:
-    node = _replace_unsupported_ops(node)
-    node = rewrites.column_pruning(node)
-    node, ordering = rewrites.pull_up_order(node, order_root=True)
-    node = rewrites.column_pruning(node)
-    sql = compile_node(node).to_sql()
-    return sql, node.schema.to_bigquery(), ordering
+    ordering: Optional[bf_ordering.RowOrdering] = result_node.order_by
+    result_node = dataclasses.replace(result_node, order_by=None)
+    result_node = cast(nodes.ResultNode, rewrites.column_pruning(result_node))
+    sql = compile_result_node(result_node)
+    # Return the ordering iff no extra columns are needed to define the row order
+    output_order: Optional[bf_ordering.RowOrdering] = None
+    if ordering is not None:
+        output_order = (
+            ordering if ordering.referenced_columns.issubset(result_node.ids) else None
+        )
+    assert (not request.materialize_all_order_keys) or (output_order is not None)
+    return CompileResult(sql, result_node.schema.to_bigquery(), output_order)


 def _replace_unsupported_ops(node: nodes.BigFrameNode):
@@ -86,6 +99,14 @@ def _replace_unsupported_ops(node: nodes.BigFrameNode):
     return node


+def compile_result_node(root: nodes.ResultNode) -> str:
+    return compile_node(root.child).to_sql(
+        order_by=root.order_by.all_ordering_columns if root.order_by else (),
+        limit=root.limit,
+        selections=root.output_cols,
+    )
+
+
 # TODO: Remove cache when schema
no longer requires compilation to derive schema (and therefor only compiles for execution)
 @functools.lru_cache(maxsize=5000)
 def compile_node(node: nodes.BigFrameNode) -> compiled.UnorderedIR:
diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py
index 581ab67718..20d6dbf9a1 100644
--- a/bigframes/core/compile/googlesql/expression.py
+++ b/bigframes/core/compile/googlesql/expression.py
@@ -25,7 +25,7 @@
 * `expression`: Models basic SQL expressions.

 Extended classes (not part of standard GoogleSQL syntax, but added for convenience):

 * `ColumnExpression`: Represents column references.
 * `TableExpression`: Represents table references.
 * `AliasExpression`: Represents aliased expressions.
diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py
index dfe21ef7b2..e3b7a2c8ca 100644
--- a/bigframes/core/compile/googlesql/query.py
+++ b/bigframes/core/compile/googlesql/query.py
@@ -63,22 +63,31 @@ class Select(abc.SQLSyntax):

     def select(
         self,
-        columns: typing.Union[typing.Iterable[str], str, None] = None,
+        columns: typing.Union[
+            typing.Iterable[str], typing.Iterable[tuple[str, str]], str, None
+        ] = None,
         distinct: bool = False,
     ) -> Select:
         if isinstance(columns, str):
             columns = [columns]
         self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = (
-            [
-                SelectExpression(expression=expr.ColumnExpression(name=column))
-                for column in columns
-            ]
+            [self._select_field(column) for column in columns]
             if columns
             else [SelectAll(expression=expr.StarExpression())]
         )
         self.distinct = distinct
         return self

+    def _select_field(self, field) -> SelectExpression:
+        if isinstance(field, str):
+            return SelectExpression(expression=expr.ColumnExpression(name=field))
+
+        else:
+            alias = field[1] if (field[0] != field[1]) else None
+            return SelectExpression(
+                expression=expr.ColumnExpression(name=field[0]), alias=alias
+            )
+
     def from_(
         self,
         sources: typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]],
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index 99c8f09bc0..d2c301b4ad 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -36,7 +36,7 @@
 from bigframes.core import identifiers, local_data
 from bigframes.core.bigframe_node import BigFrameNode, COLUMN_SET, Field
 import bigframes.core.expression as ex
-from bigframes.core.ordering import OrderingExpression
+from bigframes.core.ordering import OrderingExpression, RowOrdering
 import bigframes.core.slices as slices
 import bigframes.core.window_spec as window
 import bigframes.dtypes
@@ -1602,11 +1602,50 @@ def remap_refs(


 # Introduced during planing/compilation
+# TODO: Enforce more strictly that this should never be a child node
 @dataclasses.dataclass(frozen=True, eq=False)
 class ResultNode(UnaryNode):
-    output_names: tuple[str, ...]
-    order_by: Tuple[OrderingExpression, ...] = ()
+    output_cols: tuple[tuple[ex.DerefOp, str], ...]
+    order_by: Optional[RowOrdering] = None
     limit: Optional[int] = None
+    # TODO: CTE definitions
+
+    @property
+    def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]:
+        return ()
+
+    def remap_vars(
+        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
+    ) -> ResultNode:
+        return self
+
+    def remap_refs(
+        self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId]
+    ) -> ResultNode:
+        output_names = tuple(
+            (ref.remap_column_refs(mappings), name) for ref, name in self.output_cols
+        )
+        order_by = self.order_by.remap_column_refs(mappings) if self.order_by else None
+        return dataclasses.replace(self, output_cols=output_names, order_by=order_by)  # type: ignore
+
+    @property
+    def consumed_ids(self) -> COLUMN_SET:
+        out_refs = frozenset(ref.id for ref, _ in self.output_cols)
+        order_refs = self.order_by.referenced_columns if self.order_by else frozenset()
+        return out_refs | order_refs
+
+    @property
+    def row_count(self) -> Optional[int]:
+        child_count = self.child.row_count
+        if child_count is None:
+            return None
+        if self.limit is None:
+            return child_count
+        return min(self.limit, child_count)
+
+    @property
+    def variables_introduced(self) -> int:
+        return 0


 # Tree operators
diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py
index 128cefe94c..555199dcd9 100644
--- a/bigframes/core/rewrite/__init__.py
+++ b/bigframes/core/rewrite/__init__.py
@@ -15,10 +15,10 @@
 from bigframes.core.rewrite.identifiers import remap_variables
 from bigframes.core.rewrite.implicit_align import try_row_join
 from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
-from bigframes.core.rewrite.order import pull_up_order
+from bigframes.core.rewrite.order import bake_order, defer_order
 from bigframes.core.rewrite.pruning import column_pruning
 from bigframes.core.rewrite.scan_reduction import try_reduce_to_table_scan
-from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
+from bigframes.core.rewrite.slices import pull_up_limits, rewrite_slice
 from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions
 from bigframes.core.rewrite.windows import rewrite_range_rolling

@@ -27,10 +27,11 @@
     "try_row_join",
     "rewrite_slice",
     "rewrite_timedelta_expressions",
-    "pullup_limit_from_slice",
+    "pull_up_limits",
     "remap_variables",
-    "pull_up_order",
+    "defer_order",
     "column_pruning",
     "rewrite_range_rolling",
     "try_reduce_to_table_scan",
+    "bake_order",
 ]
diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py
index cca55092e0..06deba996c 100644
--- a/bigframes/core/rewrite/order.py
+++ b/bigframes/core/rewrite/order.py
@@ -15,17 +15,40 @@
 import functools
 from typing import Mapping, Tuple

-from bigframes.core import identifiers
-import bigframes.core.expression
+from bigframes.core import expression, identifiers
 import bigframes.core.nodes
 import bigframes.core.ordering
 import bigframes.core.window_spec
-import bigframes.operations
 from bigframes.operations import aggregations as agg_ops


+def defer_order(
+    root: bigframes.core.nodes.ResultNode, output_hidden_row_keys: bool
+) -> bigframes.core.nodes.ResultNode:
+    new_child, order = _pull_up_order(root.child, order_root=True)
+    order_by = (
+        order.with_ordering_columns(root.order_by.all_ordering_columns)
+        if root.order_by
+        else order
+    )
+    if output_hidden_row_keys:
+        output_names = tuple((expression.DerefOp(id), id.sql) for id in new_child.ids)
+    else:
+        output_names = root.output_cols
+    return dataclasses.replace(
+        root,
output_cols=output_names, child=new_child, order_by=order_by + ) + + +def bake_order( + node: bigframes.core.nodes.BigFrameNode, +) -> bigframes.core.nodes.BigFrameNode: + node, _ = _pull_up_order(node, order_root=False) + return node + + # Makes ordering explicit in window definitions -def pull_up_order( +def _pull_up_order( root: bigframes.core.nodes.BigFrameNode, *, order_root: bool = True, diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 5f4990094c..61375cf352 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -15,14 +15,13 @@ import functools from typing import AbstractSet -from bigframes.core import identifiers -import bigframes.core.nodes +from bigframes.core import identifiers, nodes def column_pruning( - root: bigframes.core.nodes.BigFrameNode, -) -> bigframes.core.nodes.BigFrameNode: - return bigframes.core.nodes.top_down(root, prune_columns) + root: nodes.BigFrameNode, +) -> nodes.BigFrameNode: + return nodes.top_down(root, prune_columns) def to_fixed(max_iterations: int = 100): @@ -48,12 +47,14 @@ def wrapper(*args, **kwargs): @to_fixed(max_iterations=100) -def prune_columns(node: bigframes.core.nodes.BigFrameNode): - if isinstance(node, bigframes.core.nodes.SelectionNode): +def prune_columns(node: nodes.BigFrameNode): + if isinstance(node, nodes.SelectionNode): result = prune_selection_child(node) - elif isinstance(node, bigframes.core.nodes.AggregateNode): + elif isinstance(node, nodes.ResultNode): result = node.replace_child(prune_node(node.child, node.consumed_ids)) - elif isinstance(node, bigframes.core.nodes.InNode): + elif isinstance(node, nodes.AggregateNode): + result = node.replace_child(prune_node(node.child, node.consumed_ids)) + elif isinstance(node, nodes.InNode): result = dataclasses.replace( node, right_child=prune_node(node.right_child, frozenset([node.right_col.id])), @@ -64,23 +65,23 @@ def prune_columns(node: bigframes.core.nodes.BigFrameNode): def prune_selection_child( - selection: bigframes.core.nodes.SelectionNode, -) -> bigframes.core.nodes.BigFrameNode: + selection: nodes.SelectionNode, +) -> nodes.BigFrameNode: child = selection.child # Important to check this first if list(selection.ids) == list(child.ids): return child - if isinstance(child, bigframes.core.nodes.SelectionNode): + if isinstance(child, nodes.SelectionNode): return selection.remap_refs( {id: ref.id for ref, id in child.input_output_pairs} ).replace_child(child.child) - elif isinstance(child, bigframes.core.nodes.AdditiveNode): + elif isinstance(child, nodes.AdditiveNode): if not set(field.id for field in child.added_fields) & selection.consumed_ids: return selection.replace_child(child.additive_base) needed_ids = selection.consumed_ids | child.referenced_ids - if isinstance(child, bigframes.core.nodes.ProjectionNode): + if isinstance(child, nodes.ProjectionNode): # Projection expressions are independent, so can be individually removed from the node child = dataclasses.replace( child, @@ -91,75 +92,67 @@ def prune_selection_child( return selection.replace_child( child.replace_additive_base(prune_node(child.additive_base, needed_ids)) ) - elif isinstance(child, bigframes.core.nodes.ConcatNode): + elif isinstance(child, nodes.ConcatNode): indices = [ list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs ] new_children = [] for concat_node in child.child_nodes: cc_ids = tuple(concat_node.ids) - sub_selection = tuple( - bigframes.core.nodes.AliasedRef.identity(cc_ids[i]) for i in indices - ) - 
new_children.append( - bigframes.core.nodes.SelectionNode(concat_node, sub_selection) - ) - return bigframes.core.nodes.ConcatNode( + sub_selection = tuple(nodes.AliasedRef.identity(cc_ids[i]) for i in indices) + new_children.append(nodes.SelectionNode(concat_node, sub_selection)) + return nodes.ConcatNode( children=tuple(new_children), output_ids=tuple(selection.ids) ) # Nodes that pass through input columns elif isinstance( child, ( - bigframes.core.nodes.RandomSampleNode, - bigframes.core.nodes.ReversedNode, - bigframes.core.nodes.OrderByNode, - bigframes.core.nodes.FilterNode, - bigframes.core.nodes.SliceNode, - bigframes.core.nodes.JoinNode, - bigframes.core.nodes.ExplodeNode, + nodes.RandomSampleNode, + nodes.ReversedNode, + nodes.OrderByNode, + nodes.FilterNode, + nodes.SliceNode, + nodes.JoinNode, + nodes.ExplodeNode, ), ): ids = selection.consumed_ids | child.referenced_ids return selection.replace_child( child.transform_children(lambda x: prune_node(x, ids)) ) - elif isinstance(child, bigframes.core.nodes.AggregateNode): + elif isinstance(child, nodes.AggregateNode): return selection.replace_child(prune_aggregate(child, selection.consumed_ids)) - elif isinstance(child, bigframes.core.nodes.LeafNode): + elif isinstance(child, nodes.LeafNode): return selection.replace_child(prune_leaf(child, selection.consumed_ids)) return selection def prune_node( - node: bigframes.core.nodes.BigFrameNode, + node: nodes.BigFrameNode, ids: AbstractSet[identifiers.ColumnId], ): # This clause is important, ensures idempotency, so can reach fixed point if not (set(node.ids) - ids): return node else: - return bigframes.core.nodes.SelectionNode( + return nodes.SelectionNode( node, - tuple( - bigframes.core.nodes.AliasedRef.identity(id) - for id in node.ids - if id in ids - ), + tuple(nodes.AliasedRef.identity(id) for id in node.ids if id in ids), ) def prune_aggregate( - node: bigframes.core.nodes.AggregateNode, + node: nodes.AggregateNode, used_cols: AbstractSet[identifiers.ColumnId], -) -> bigframes.core.nodes.AggregateNode: +) -> nodes.AggregateNode: pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) return dataclasses.replace(node, aggregations=pruned_aggs) @functools.singledispatch def prune_leaf( - node: bigframes.core.nodes.BigFrameNode, + node: nodes.BigFrameNode, used_cols: AbstractSet[identifiers.ColumnId], ): ... @@ -167,9 +160,9 @@ def prune_leaf( @prune_leaf.register def prune_readlocal( - node: bigframes.core.nodes.ReadLocalNode, + node: nodes.ReadLocalNode, selection: AbstractSet[identifiers.ColumnId], -) -> bigframes.core.nodes.ReadLocalNode: +) -> nodes.ReadLocalNode: new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace( node, @@ -180,8 +173,8 @@ def prune_readlocal( @prune_leaf.register def prune_readtable( - node: bigframes.core.nodes.ReadTableNode, + node: nodes.ReadTableNode, selection: AbstractSet[identifiers.ColumnId], -) -> bigframes.core.nodes.ReadTableNode: +) -> nodes.ReadTableNode: new_scan_list = node.scan_list.filter_cols(selection) return dataclasses.replace(node, scan_list=new_scan_list) diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 87a7720e2f..b8a003e061 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import annotations +import dataclasses import functools from typing import Optional, Sequence, Tuple @@ -24,7 +25,19 @@ import bigframes.operations as ops -def pullup_limit_from_slice( +def pull_up_limits(root: nodes.ResultNode) -> nodes.ResultNode: + new_child, pulled_limit = _pullup_slice_inner(root.child) + if new_child == root.child: + return root + elif pulled_limit is None: + return dataclasses.replace(root, child=new_child) + else: + # new child has redundant slice ops removed now + new_limit = min(pulled_limit, root.limit) if root.limit else pulled_limit + return dataclasses.replace(root, child=new_child, limit=new_limit) + + +def _pullup_slice_inner( root: nodes.BigFrameNode, ) -> Tuple[nodes.BigFrameNode, Optional[int]]: """ @@ -40,7 +53,7 @@ def pullup_limit_from_slice( assert root.step == 1 assert root.stop is not None limit = root.stop - new_root, prior_limit = pullup_limit_from_slice(root.child) + new_root, prior_limit = _pullup_slice_inner(root.child) if (prior_limit is not None) and (prior_limit < limit): limit = prior_limit return new_root, limit @@ -48,7 +61,7 @@ def pullup_limit_from_slice( isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode)) and root.row_preserving ): - new_child, prior_limit = pullup_limit_from_slice(root.child) + new_child, prior_limit = _pullup_slice_inner(root.child) if prior_limit is not None: return root.transform_children(lambda _: new_child), prior_limit # Most ops don't support pulling up slice, like filter, agg, join, etc. From 58e7cb025a86959164643cebb725c853dc2ebc34 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 21 Apr 2025 17:52:17 -0700 Subject: [PATCH 09/12] fix: add retry for test_clean_up_via_context_manager (#1627) --- tests/system/large/test_session.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index 1dac8c851e..90955f5ddf 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ -151,6 +151,7 @@ def test_clean_up_by_session_id(): pytest.param(bigframes.connect, id="connect-method"), ], ) +@pytest.mark.flaky(retries=3) def test_clean_up_via_context_manager(session_creator): # we will create two tables and confirm that they are deleted # when the session is closed From 8cc56d5118017beb2931519ddd1eb8e151852849 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Mon, 21 Apr 2025 22:25:30 -0700 Subject: [PATCH 10/12] fix: improve robustness of managed udf code extraction (#1634) * fix: improve robustness of managed udf code extraction * resolve comments * quick fix --- bigframes/functions/_function_client.py | 6 +++++- tests/system/large/functions/test_managed_function.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 8a591f6916..0cc3d52c38 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -19,6 +19,7 @@ import logging import os import random +import re import shutil import string import tempfile @@ -272,7 +273,10 @@ def provision_bq_managed_function( # i.e. there are no references to variables or imports outside the # body. 
udf_code = textwrap.dedent(inspect.getsource(func)) - udf_code = udf_code[udf_code.index("def") :] + match = re.search(r"^def ", udf_code, flags=re.MULTILINE) + if match is None: + raise ValueError("The UDF is not defined correctly.") + udf_code = udf_code[match.start() :] with_connection_clause = ( ( diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index a15bce83ad..9eba1907e6 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -171,7 +171,12 @@ def featurize(x: int) -> list[float]: def test_managed_function_series_apply(session, dataset_id, scalars_dfs): try: - @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) + # An explicit name with "def" in it is used to test the robustness of + # the user code extraction logic, which depends on that term. + bq_name = f"{prefixer.create_prefix()}_def_to_test_code_extraction" + assert "def" in bq_name, "The substring 'def' was not found in 'bq_name'" + + @session.udf(dataset=dataset_id, name=bq_name) def foo(x: int) -> bytes: return bytes(abs(x)) From bf1ae7091a02ad28d222fa63d311ed5ef3800807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 22 Apr 2025 11:02:21 -0500 Subject: [PATCH 11/12] feat: add `bigframes.bigquery.st_distance` function (#1637) * feat: add `bigframes.bigquery.st_distance` function * fix docstring * add tests * add tests * type checks * make sure shapely.Point is available * fix docstrings, add null row test * GeoSereies typo * Update bigframes/dtypes.py * Update bigframes/dtypes.py * Update third_party/bigframes_vendored/geopandas/geoseries.py --- bigframes/bigquery/__init__.py | 8 +- bigframes/bigquery/_operations/geo.py | 161 +++++++++---- bigframes/core/compile/scalar_op_compiler.py | 12 + bigframes/dtypes.py | 24 +- bigframes/geopandas/geoseries.py | 22 +- bigframes/operations/__init__.py | 2 + bigframes/operations/geo_ops.py | 11 + setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/bigquery/test_geo.py | 223 +++++++++++++----- .../system/small/geopandas/test_geoseries.py | 28 +++ tests/unit/core/test_dtypes.py | 17 ++ .../bigframes_vendored/geopandas/geoseries.py | 51 ++++ 13 files changed, 424 insertions(+), 139 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index c04350275d..6eb725975e 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -27,7 +27,12 @@ unix_millis, unix_seconds, ) -from bigframes.bigquery._operations.geo import st_area, st_difference, st_intersection +from bigframes.bigquery._operations.geo import ( + st_area, + st_difference, + st_distance, + st_intersection, +) from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -49,6 +54,7 @@ # geo ops "st_area", "st_difference", + "st_distance", "st_intersection", # json ops "json_set", diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py index f2d8b7b577..262dca6d6b 100644 --- a/bigframes/bigquery/_operations/geo.py +++ b/bigframes/bigquery/_operations/geo.py @@ -14,8 +14,11 @@ from __future__ import annotations +from typing import Union + +import shapely # type: ignore + from bigframes import operations as ops -import bigframes.dtypes import bigframes.geopandas import bigframes.series @@ -25,7 +28,9 @@ """ -def st_area(series: bigframes.series.Series) -> bigframes.series.Series: +def st_area( + series: 
Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], +) -> bigframes.series.Series: """ Returns the area in square meters covered by the polygons in the input `GEOGRAPHY`. @@ -85,6 +90,10 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series: 4 0.0 dtype: Float64 + Args: + series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries): + A series containing geography objects. + Returns: bigframes.pandas.Series: Series of float representing the areas. @@ -95,7 +104,10 @@ def st_area(series: bigframes.series.Series) -> bigframes.series.Series: def st_difference( - series: bigframes.series.Series, other: bigframes.series.Series + series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], + other: Union[ + bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry + ], ) -> bigframes.series.Series: """ Returns a `GEOGRAPHY` that represents the point set difference of @@ -166,44 +178,23 @@ def st_difference( 5 None dtype: geometry - We can also check difference of single shapely geometries: - - >>> polygon_s1 = bigframes.geopandas.GeoSeries( - ... [ - ... Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) - ... ] - ... ) - >>> polygon_s2 = bigframes.geopandas.GeoSeries( - ... [ - ... Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]) - ... ] - ... ) - - >>> polygon_s1 - 0 POLYGON ((0 0, 10 0, 10 10, 0 0)) - dtype: geometry - - >>> polygon_s2 - 0 POLYGON ((4 2, 6 2, 8 6, 4 2)) - dtype: geometry - - >>> bbq.st_difference(polygon_s1, polygon_s2) - 0 POLYGON ((0 0, 10 0, 10 10, 0 0), (8 6, 6 2, 4... - dtype: geometry - Additionally, we can check difference of a GeoSeries against a single shapely geometry: - >>> bbq.st_difference(s1, polygon_s2) - 0 POLYGON ((0 0, 2 2, 0 2, 0 0)) - 1 None - 2 None - 3 None - 4 None + >>> polygon = Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]) + >>> bbq.st_difference(s1, polygon) + 0 POLYGON ((1.97082 2.00002, 0 2, 0 0, 1.97082 2... + 1 POLYGON ((1.97082 2.00002, 0 2, 0 0, 1.97082 2... + 2 GEOMETRYCOLLECTION EMPTY + 3 LINESTRING (0.99265 1.00781, 0 2) + 4 POINT (0 1) dtype: geometry Args: - other (bigframes.series.Series or geometric object): - The GeoSeries (elementwise) or geometric object to find the difference to. + series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries): + A series containing geography objects. + other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry): + The series or geometric object to subtract from the geography + objects in ``series``. Returns: bigframes.series.Series: @@ -213,8 +204,86 @@ def st_difference( return series._apply_binary_op(other, ops.geo_st_difference_op) +def st_distance( + series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries], + other: Union[ + bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry + ], + *, + use_spheroid: bool = False, +) -> bigframes.series.Series: + """ + Returns the shortest distance in meters between two non-empty + ``GEOGRAPHY`` objects. + + **Examples:** + + >>> import bigframes as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.geopandas + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + We can check two GeoSeries against each other, row by row. + + >>> s1 = bigframes.geopandas.GeoSeries( + ... [ + ... Point(0, 0), + ... Point(0.00001, 0), + ... Point(0.00002, 0), + ... ], + ... ) + >>> s2 = bigframes.geopandas.GeoSeries( + ... [ + ... Point(0.00001, 0), + ... Point(0.00003, 0), + ... Point(0.00005, 0), + ... 
],
+    ... )
+
+    >>> bbq.st_distance(s1, s2, use_spheroid=True)
+    0    1.113195
+    1     2.22639
+    2    3.339585
+    dtype: Float64
+
+    We can also calculate the distance between each geometry and a single shapely geometry:
+
+    >>> bbq.st_distance(s2, Point(0.00001, 0))
+    0         0.0
+    1    2.223902
+    2    4.447804
+    dtype: Float64
+
+    Args:
+        series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
+            A series containing geography objects.
+        other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry):
+            The series or geometric object to calculate the distance to, in
+            meters, from the geography objects in ``series``.
+        use_spheroid (optional, default ``False``):
+            Determines how this function measures distance. If ``use_spheroid``
+            is False, the function measures distance on the surface of a perfect
+            sphere. If ``use_spheroid`` is True, the function measures distance
+            on the surface of the `WGS84 spheroid `_. The
+            default value of ``use_spheroid`` is False.
+
+    Returns:
+        bigframes.pandas.Series:
+            The Series (elementwise) of the smallest distance between
+            each aligned geometry and other.
+    """
+    return series._apply_binary_op(
+        other, ops.GeoStDistanceOp(use_spheroid=use_spheroid)
+    )
+
+
 def st_intersection(
-    series: bigframes.series.Series, other: bigframes.series.Series
+    series: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
+    other: Union[
+        bigframes.series.Series, bigframes.geopandas.GeoSeries, shapely.Geometry
+    ],
 ) -> bigframes.series.Series:
     """
     Returns a `GEOGRAPHY` that represents the point set intersection of the two
@@ -284,18 +353,20 @@ def st_intersection(
     We can also do intersection of each geometry and a single shapely geometry:

-    >>> bbq.st_intersection(s1, bigframes.geopandas.GeoSeries([Polygon([(0, 0), (1, 1), (0, 1)])]))
+    >>> bbq.st_intersection(s1, Polygon([(0, 0), (1, 1), (0, 1)]))
     0    POLYGON ((0 0, 0.99954 1, 0 1, 0 0))
-    1                                    None
-    2                                    None
-    3                                    None
-    4                                    None
+    1    POLYGON ((0 0, 0.99954 1, 0 1, 0 0))
+    2             LINESTRING (0 0, 0.99954 1)
+    3                GEOMETRYCOLLECTION EMPTY
+    4                             POINT (0 1)
     dtype: geometry

     Args:
-        other (GeoSeries or geometric object):
-            The Geoseries (elementwise) or geometric object to find the
-            intersection with.
+        series (bigframes.pandas.Series | bigframes.geopandas.GeoSeries):
+            A series containing geography objects.
+        other (bigframes.pandas.Series | bigframes.geopandas.GeoSeries | shapely.Geometry):
+            The series or geometric object to intersect with the geography
+            objects in ``series``.

     Returns:
         bigframes.geopandas.GeoSeries:
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index eda70f5cf1..8243627a91 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -1023,6 +1023,13 @@ def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value):
     )


+@scalar_op_compiler.register_binary_op(ops.GeoStDistanceOp, pass_op=True)
+def geo_st_distance_op_impl(
+    x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp
+):
+    return st_distance(x, y, op.use_spheroid)
+
+
 @scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op)
 def geo_st_geogfromtext_op_impl(x: ibis_types.Value):
     # Ibis doesn't seem to provide a dedicated method to cast from string to geography,
@@ -1989,6 +1996,11 @@ def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography:  # type: ignore
     """Find the boundary of a geography."""


+@ibis_udf.scalar.builtin
+def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float:  # type: ignore
+    """Compute the shortest distance in meters between two geographies."""
+
+
 @ibis_udf.scalar.builtin
 def unix_micros(a: ibis_dtypes.timestamp) -> int:  # type: ignore
     """Convert a timestamp to microseconds"""
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 47b128dae6..de6c331043 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -586,30 +586,32 @@ def _is_bigframes_dtype(dtype) -> bool:
     return False


-def _infer_dtype_from_python_type(type: type) -> Dtype:
-    if type in (datetime.timedelta, pd.Timedelta, np.timedelta64):
+def _infer_dtype_from_python_type(type_: type) -> Dtype:
+    if type_ in (datetime.timedelta, pd.Timedelta, np.timedelta64):
         # Must check timedelta type first. Otherwise other branchs will be evaluated to true
         # E.g. np.timedelta64 is a sublcass as np.integer
         return TIMEDELTA_DTYPE
-    if issubclass(type, (bool, np.bool_)):
+    if issubclass(type_, (bool, np.bool_)):
         return BOOL_DTYPE
-    if issubclass(type, (int, np.integer)):
+    if issubclass(type_, (int, np.integer)):
         return INT_DTYPE
-    if issubclass(type, (float, np.floating)):
+    if issubclass(type_, (float, np.floating)):
         return FLOAT_DTYPE
-    if issubclass(type, decimal.Decimal):
+    if issubclass(type_, decimal.Decimal):
         return NUMERIC_DTYPE
-    if issubclass(type, (str, np.str_)):
+    if issubclass(type_, (str, np.str_)):
         return STRING_DTYPE
-    if issubclass(type, (bytes, np.bytes_)):
+    if issubclass(type_, (bytes, np.bytes_)):
         return BYTES_DTYPE
-    if issubclass(type, datetime.date):
+    if issubclass(type_, datetime.date):
         return DATE_DTYPE
-    if issubclass(type, datetime.time):
+    if issubclass(type_, datetime.time):
         return TIME_DTYPE
+    if issubclass(type_, shapely.Geometry):
+        return GEO_DTYPE
     else:
         raise TypeError(
-            f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
+            f"No matching datatype for python type: {type_}. {constants.FEEDBACK_LINK}"
         )
diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py
index c93a02deb8..38ebda7d92 100644
--- a/bigframes/geopandas/geoseries.py
+++ b/bigframes/geopandas/geoseries.py
@@ -47,23 +47,6 @@ def y(self) -> bigframes.series.Series:
     # we can.
     @property
     def area(self, crs=None) -> bigframes.series.Series:  # type: ignore
-        """Returns a Series containing the area of each geometry in the GeoSeries
-        expressed in the units of the CRS.
-
-        Args:
-            crs (optional):
-                Coordinate Reference System of the geometry objects.
Can be - anything accepted by pyproj.CRS.from_user_input(), such as an - authority string (eg “EPSG:4326”) or a WKT string. - - Returns: - bigframes.pandas.Series: - Series of float representing the areas. - - Raises: - NotImplementedError: - GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. - """ raise NotImplementedError( f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" ) @@ -97,5 +80,10 @@ def to_wkt(self: GeoSeries) -> bigframes.series.Series: def difference(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series: # type: ignore return self._apply_binary_op(other, ops.geo_st_difference_op) + def distance(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series: # type: ignore + raise NotImplementedError( + f"GeoSeries.distance is not supported. Use bigframes.bigquery.st_distance(series, other), instead. {constants.FEEDBACK_LINK}" + ) + def intersection(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series: # type: ignore return self._apply_binary_op(other, ops.geo_st_intersection_op) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 3e0ebd5089..74ff5c0f98 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -96,6 +96,7 @@ geo_st_intersection_op, geo_x_op, geo_y_op, + GeoStDistanceOp, ) from bigframes.operations.json_ops import ( JSONExtract, @@ -375,6 +376,7 @@ "geo_st_intersection_op", "geo_x_op", "geo_y_op", + "GeoStDistanceOp", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 6f988c2585..98da9099cd 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import dataclasses + from bigframes import dtypes from bigframes.operations import base_ops import bigframes.operations.type as op_typing @@ -69,3 +71,12 @@ geo_st_intersection_op = base_ops.create_binary_op( name="geo_st_intersection", type_signature=op_typing.BinaryGeo() ) + + +@dataclasses.dataclass(frozen=True) +class GeoStDistanceOp(base_ops.BinaryOp): + name = "st_distance" + use_spheroid: bool + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.FLOAT_DTYPE diff --git a/setup.py b/setup.py index edc77e11b6..1fe7006860 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "pyarrow >=15.0.2", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "shapely >=1.8.5", + "shapely >=2.0.0", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index dff245d176..b0537cd035 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,7 +19,7 @@ pyarrow==15.0.2 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -shapely==1.8.5 +shapely==2.0.0 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py index c842f1c99d..fa2c522109 100644 --- a/tests/system/small/bigquery/test_geo.py +++ b/tests/system/small/bigquery/test_geo.py @@ -14,6 +14,7 @@ import geopandas # type: ignore import pandas as pd +import pandas.testing from shapely.geometry import ( # type: ignore GeometryCollection, LineString, @@ -23,7 +24,6 @@ import bigframes.bigquery as bbq import bigframes.geopandas -import bigframes.series def test_geo_st_area(): @@ -54,7 +54,7 @@ def test_geo_st_area(): check_dtype=False, check_index_type=False, check_exact=False, - rtol=1, + rtol=0.1, ) @@ -75,7 +75,7 @@ def test_geo_st_difference_with_geometry_objects(): geobf_s2 = bigframes.geopandas.GeoSeries(data=data2) geobf_s_result = bbq.st_difference(geobf_s1, geobf_s2).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [ GeometryCollection([]), GeometryCollection([]), @@ -83,46 +83,45 @@ def test_geo_st_difference_with_geometry_objects(): ], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), - ).to_pandas() - - assert geobf_s_result.dtype == "geometry" - assert expected.iloc[0].equals(geobf_s_result.iloc[0]) - assert expected.iloc[1].equals(geobf_s_result.iloc[1]) - assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) def test_geo_st_difference_with_single_geometry_object(): data1 = [ - Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), - Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Polygon([(0, 0), (10, 0), (10, 10), (0, 10), (0, 0)]), + Polygon([(0, 1), (10, 1), (10, 9), (0, 9), (0, 1)]), Point(0, 1), ] geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) geobf_s_result = bbq.st_difference( geobf_s1, - bigframes.geopandas.GeoSeries( - [ - Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), - Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), - ] - ), + Polygon([(0, 0), (10, 0), (10, 5), (0, 5), (0, 0)]), ).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [ + Polygon([(10, 5), (10, 10), (0, 10), (0, 5), (10, 5)]), + Polygon([(10, 5), (10, 9), (0, 9), (0, 5), (10, 5)]), GeometryCollection([]), - Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), - None, ], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), - ).to_pandas() - - assert 
geobf_s_result.dtype == "geometry" - assert (expected.iloc[0]).equals(geobf_s_result.iloc[0]) - assert expected.iloc[1] == geobf_s_result.iloc[1] - assert expected.iloc[2] == geobf_s_result.iloc[2] + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) def test_geo_st_difference_with_similar_geometry_objects(): @@ -135,16 +134,113 @@ def test_geo_st_difference_with_similar_geometry_objects(): geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) geobf_s_result = bbq.st_difference(geobf_s1, geobf_s1).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [GeometryCollection([]), GeometryCollection([]), GeometryCollection([])], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) + + +def test_geo_st_distance_with_geometry_objects(): + data1 = [ + # 0.00001 is approximately 1 meter. + Polygon([(0, 0), (0.00001, 0), (0.00001, 0.00001), (0, 0.00001), (0, 0)]), + Polygon( + [ + (0.00002, 0), + (0.00003, 0), + (0.00003, 0.00001), + (0.00002, 0.00001), + (0.00002, 0), + ] + ), + Point(0, 0.00002), + ] + + data2 = [ + Polygon( + [ + (0.00002, 0), + (0.00003, 0), + (0.00003, 0.00001), + (0.00002, 0.00001), + (0.00002, 0), + ] + ), + Point(0, 0.00002), + Polygon([(0, 0), (0.00001, 0), (0.00001, 0.00001), (0, 0.00001), (0, 0)]), + Point( + 1, 1 + ), # No matching row in data1, so this will be NULL after the call to distance. + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s2 = bigframes.geopandas.GeoSeries(data=data2) + geobf_s_result = bbq.st_distance(geobf_s1, geobf_s2).to_pandas() + + expected = pd.Series( + [ + 1.112, + 2.486, + 1.112, + None, + ], + index=[0, 1, 2, 3], + dtype="Float64", + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) + + +def test_geo_st_distance_with_single_geometry_object(): + data1 = [ + # 0.00001 is approximately 1 meter. 
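+        # (At the equator one degree spans roughly 111 km, so 0.00001 degrees
+        # is about 1.11 m; the expected distances below use this conversion.)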
+ Polygon([(0, 0), (0.00001, 0), (0.00001, 0.00001), (0, 0.00001), (0, 0)]), + Polygon( + [ + (0.00001, 0), + (0.00002, 0), + (0.00002, 0.00001), + (0.00001, 0.00001), + (0.00001, 0), + ] + ), + Point(0, 0.00002), + ] + + geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) + geobf_s_result = bbq.st_distance( + geobf_s1, + Point(0, 0), ).to_pandas() - assert geobf_s_result.dtype == "geometry" - assert expected.iloc[0].equals(geobf_s_result.iloc[0]) - assert expected.iloc[1].equals(geobf_s_result.iloc[1]) - assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + expected = pd.Series( + [ + 0, + 1.112, + 2.224, + ], + dtype="Float64", + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) def test_geo_st_intersection_with_geometry_objects(): @@ -164,7 +260,7 @@ def test_geo_st_intersection_with_geometry_objects(): geobf_s2 = bigframes.geopandas.GeoSeries(data=data2) geobf_s_result = bbq.st_intersection(geobf_s1, geobf_s2).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [ Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), Polygon([(0, 0), (1, 1), (0, 1), (0, 0)]), @@ -172,46 +268,45 @@ def test_geo_st_intersection_with_geometry_objects(): ], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), - ).to_pandas() - - assert geobf_s_result.dtype == "geometry" - assert expected.iloc[0].equals(geobf_s_result.iloc[0]) - assert expected.iloc[1].equals(geobf_s_result.iloc[1]) - assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) def test_geo_st_intersection_with_single_geometry_object(): data1 = [ - Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), - Polygon([(4, 2), (6, 2), (8, 6), (4, 2)]), + Polygon([(0, 0), (10, 0), (10, 10), (0, 10), (0, 0)]), + Polygon([(0, 1), (10, 1), (10, 9), (0, 9), (0, 1)]), Point(0, 1), ] geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) geobf_s_result = bbq.st_intersection( geobf_s1, - bigframes.geopandas.GeoSeries( - [ - Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), - Polygon([(1, 0), (0, 5), (0, 0), (1, 0)]), - ] - ), + Polygon([(0, 0), (10, 0), (10, 5), (0, 5), (0, 0)]), ).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [ - Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), - GeometryCollection([]), - None, + Polygon([(0, 0), (10, 0), (10, 5), (0, 5), (0, 0)]), + Polygon([(0, 1), (10, 1), (10, 5), (0, 5), (0, 1)]), + Point(0, 1), ], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), - ).to_pandas() - - assert geobf_s_result.dtype == "geometry" - assert (expected.iloc[0]).equals(geobf_s_result.iloc[0]) - assert expected.iloc[1] == geobf_s_result.iloc[1] - assert expected.iloc[2] == geobf_s_result.iloc[2] + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) def test_geo_st_intersection_with_similar_geometry_objects(): @@ -224,7 +319,7 @@ def test_geo_st_intersection_with_similar_geometry_objects(): geobf_s1 = bigframes.geopandas.GeoSeries(data=data1) geobf_s_result = bbq.st_intersection(geobf_s1, geobf_s1).to_pandas() - expected = bigframes.series.Series( + expected = pd.Series( [ Polygon([(0, 0), (10, 0), (10, 10), (0, 0)]), Polygon([(0, 0), (1, 1), (0, 1)]), @@ -232,9 +327,11 @@ def test_geo_st_intersection_with_similar_geometry_objects(): ], index=[0, 1, 2], dtype=geopandas.array.GeometryDtype(), - ).to_pandas() - - assert 
geobf_s_result.dtype == "geometry" - assert expected.iloc[0].equals(geobf_s_result.iloc[0]) - assert expected.iloc[1].equals(geobf_s_result.iloc[1]) - assert expected.iloc[2].equals(geobf_s_result.iloc[2]) + ) + pandas.testing.assert_series_equal( + geobf_s_result, + expected, + check_index_type=False, + check_exact=False, + rtol=0.1, + ) diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index 18f3ff2675..ae99fd6fc2 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -29,6 +29,7 @@ ) import bigframes.geopandas +import bigframes.pandas import bigframes.series from tests.system.utils import assert_series_equal @@ -95,6 +96,33 @@ def test_geo_area_not_supported(): bf_series.area +def test_geo_distance_not_supported(): + s1 = bigframes.pandas.Series( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ], + dtype=GeometryDtype(), + ) + s2 = bigframes.geopandas.GeoSeries( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + ) + with pytest.raises( + NotImplementedError, + match=re.escape("GeoSeries.distance is not supported."), + ): + s1.geo.distance(s2) + + def test_geo_from_xy(): x = [2.5, 5, -3.0] y = [0.5, 1, 1.5] diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 3d420de51f..bbeac3602b 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -20,6 +20,7 @@ import pandas as pd import pyarrow as pa # type: ignore import pytest +import shapely # type: ignore import bigframes.core.compile.ibis_types import bigframes.dtypes @@ -224,6 +225,22 @@ def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str): assert result == ibis_dtype +@pytest.mark.parametrize( + ["python_type", "expected_dtype"], + [ + (bool, bigframes.dtypes.BOOL_DTYPE), + (int, bigframes.dtypes.INT_DTYPE), + (str, bigframes.dtypes.STRING_DTYPE), + (shapely.Point, bigframes.dtypes.GEO_DTYPE), + (shapely.Polygon, bigframes.dtypes.GEO_DTYPE), + (shapely.Geometry, bigframes.dtypes.GEO_DTYPE), + ], +) +def test_bigframes_type_supports_python_types(python_type, expected_dtype): + got_dtype = bigframes.dtypes.bigframes_type(python_type) + assert got_dtype == expected_dtype + + def test_unsupported_dtype_raises_unexpected_datatype(): """Incompatible dtypes should fail when passed into BigQuery DataFrames""" with pytest.raises(ValueError, match="Datatype has no ibis type mapping"): diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index 4ad4f383cf..613a929c04 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -37,6 +37,33 @@ class GeoSeries: e.g. ``name``. """ + # GeoSeries.area overrides Series.area with something totally different. + # Ignore this type error, as we are trying to be as close to geopandas as + # we can. + @property + def area(self, crs=None) -> bigframes.series.Series: # type: ignore + """[Not Implemented] Use ``bigframes.bigquery.st_area(series)``, + instead to return the area in square meters. + + In GeoPandas, this returns a Series containing the area of each geometry + in the GeoSeries expressed in the units of the CRS. 
+
+        Args:
+            crs (optional):
+                Coordinate Reference System of the geometry objects. Can be
+                anything accepted by pyproj.CRS.from_user_input(), such as an
+                authority string (eg “EPSG:4326”) or a WKT string.
+
+        Returns:
+            bigframes.pandas.Series:
+                Series of float representing the areas.
+
+        Raises:
+            NotImplementedError:
+                GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     @property
     def x(self) -> bigframes.series.Series:
         """Return the x location of point geometries in a GeoSeries
@@ -348,6 +375,30 @@ def difference(self: GeoSeries, other: GeoSeries) -> GeoSeries:  # type: ignore
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

+    def distance(self: GeoSeries, other: GeoSeries) -> bigframes.series.Series:
+        """
+        [Not Implemented] Use ``bigframes.bigquery.st_distance(series, other)``
+        instead to return the shortest distance between two
+        ``GEOGRAPHY`` objects in meters.
+
+        In GeoPandas, this returns a Series of the distances between each
+        aligned geometry and other, expressed in the units of the CRS.
+
+        Args:
+            other:
+                The GeoSeries (elementwise) or geometric object to find the distance to.
+
+        Returns:
+            bigframes.pandas.Series:
+                Series of float representing the distances.
+
+        Raises:
+            NotImplementedError:
+                GeoSeries.distance is not supported. Use
+                ``bigframes.bigquery.st_distance(series, other)``, instead.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def intersection(self: GeoSeries, other: GeoSeries) -> GeoSeries:  # type: ignore
         """
         Returns a GeoSeries of the intersection of points in each aligned

From 87139505d849eb8c962f47181e69512337af282d Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Tue, 22 Apr 2025 11:37:59 -0500
Subject: [PATCH 12/12] chore(main): release 2.1.0 (#1633)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md                              | 20 ++++++++++++++++++++
 bigframes/version.py                      |  4 ++--
 third_party/bigframes_vendored/version.py |  4 ++--
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 667273167b..3b1e331d1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,26 @@

 [1]: https://pypi.org/project/bigframes/#history

+## [2.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.0.0...v2.1.0) (2025-04-22)
+
+
+### Features
+
+* Add `bigframes.bigquery.st_distance` function ([#1637](https://github.com/googleapis/python-bigquery-dataframes/issues/1637)) ([bf1ae70](https://github.com/googleapis/python-bigquery-dataframes/commit/bf1ae7091a02ad28d222fa63d311ed5ef3800807))
+* Enable local json string validations ([#1614](https://github.com/googleapis/python-bigquery-dataframes/issues/1614)) ([233347a](https://github.com/googleapis/python-bigquery-dataframes/commit/233347aca0ac55b2407e0f49430bf13536986e25))
+* Enhance `read_csv` `index_col` parameter support ([#1631](https://github.com/googleapis/python-bigquery-dataframes/issues/1631)) ([f4e5b26](https://github.com/googleapis/python-bigquery-dataframes/commit/f4e5b26b7b7b00ef807987c4b9c5fded56ad883f))
+
+
+### Bug Fixes
+
+* Add retry for test_clean_up_via_context_manager ([#1627](https://github.com/googleapis/python-bigquery-dataframes/issues/1627)) ([58e7cb0](https://github.com/googleapis/python-bigquery-dataframes/commit/58e7cb025a86959164643cebb725c853dc2ebc34))
+* Improve robustness of managed
udf code extraction ([#1634](https://github.com/googleapis/python-bigquery-dataframes/issues/1634)) ([8cc56d5](https://github.com/googleapis/python-bigquery-dataframes/commit/8cc56d5118017beb2931519ddd1eb8e151852849)) + + +### Documentation + +* Add code samples in the `udf` API docstring ([#1632](https://github.com/googleapis/python-bigquery-dataframes/issues/1632)) ([f68b80c](https://github.com/googleapis/python-bigquery-dataframes/commit/f68b80cce2451a8c8d931a54e0cb69e02f34ce10)) + ## [2.0.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.42.0...v2.0.0) (2025-04-17) diff --git a/bigframes/version.py b/bigframes/version.py index e3a1d84bfa..b671169b24 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.0.0" +__version__ = "2.1.0" # {x-release-please-start-date} -__release_date__ = "2025-04-17" +__release_date__ = "2025-04-22" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index e3a1d84bfa..b671169b24 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.0.0" +__version__ = "2.1.0" # {x-release-please-start-date} -__release_date__ = "2025-04-17" +__release_date__ = "2025-04-22" # {x-release-please-end}
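For quick reference, the geography API added in patch 11 of this series composes as sketched below. This is a minimal usage sketch based on the docstrings above, not part of the patches themselves; it assumes bigframes 2.1.0, an authenticated BigQuery session, and illustrative coordinate values:

import bigframes.bigquery as bbq
import bigframes.geopandas
from shapely.geometry import Point

# Two aligned GeoSeries; st_distance compares them row by row, in meters.
s1 = bigframes.geopandas.GeoSeries([Point(0, 0), Point(0.00001, 0)])
s2 = bigframes.geopandas.GeoSeries([Point(0.00001, 0), Point(0.00003, 0)])

sphere_m = bbq.st_distance(s1, s2)  # perfect-sphere measurement (the default)
spheroid_m = bbq.st_distance(s1, s2, use_spheroid=True)  # WGS84 spheroid measurement

# A single shapely geometry is matched against every row of the series.
from_origin_m = bbq.st_distance(s2, Point(0, 0))
print(from_origin_m.to_pandas())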