diff --git a/.kokoro/load/common.cfg b/.kokoro/load/common.cfg index 7f6fa7e0d9..d86932662d 100644 --- a/.kokoro/load/common.cfg +++ b/.kokoro/load/common.cfg @@ -8,4 +8,4 @@ action { } build_file: "python-bigquery-dataframes/.kokoro/build.sh" -timeout_mins: 360 +timeout_mins: 720 diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 5624df3b8d..7da0881bbe 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \ # write access to COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly python3.10 scripts/publish_api_coverage.py \ + bigquery \ --bigframes_version=$BIGFRAMES_VERSION \ --release_version=$RELEASE_VERSION \ --bigquery_table=$COVERAGE_TABLE diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 517176da89..af05f4423c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,4 +38,4 @@ repos: rev: v1.1.1 hooks: - id: mypy - additional_dependencies: [types-requests, types-tabulate] + additional_dependencies: [types-requests, types-tabulate, pandas-stubs] diff --git a/CHANGELOG.md b/CHANGELOG.md index 72d0e833bb..bcb062f08f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,66 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.0.0...v1.1.0) (2024-04-04) + + +### Features + +* (Series|DataFrame).explode ([#556](https://github.com/googleapis/python-bigquery-dataframes/issues/556)) ([9e32f57](https://github.com/googleapis/python-bigquery-dataframes/commit/9e32f570b42c8ddae0c9b281b25beff91f0c922c)) +* Add `DataFrame.eval` and `DataFrame.query` ([#361](https://github.com/googleapis/python-bigquery-dataframes/issues/361)) ([5e28ebd](https://github.com/googleapis/python-bigquery-dataframes/commit/5e28ebd1ba3a5559e093c2ea676c0714c1434ba9)) +* Add ColumnTransformer save/load ([#541](https://github.com/googleapis/python-bigquery-dataframes/issues/541)) ([9d8cf67](https://github.com/googleapis/python-bigquery-dataframes/commit/9d8cf6792a8dbe03e03b102c454d15fcde7986af)) +* Add ml.metrics.mean_squared_error ([#559](https://github.com/googleapis/python-bigquery-dataframes/issues/559)) ([853c25e](https://github.com/googleapis/python-bigquery-dataframes/commit/853c25e8023bf877f28cda4dade0694d0299a83e)) +* Add support for numpy expm1, log1p, floor, ceil, arctan2 ops ([#505](https://github.com/googleapis/python-bigquery-dataframes/issues/505)) ([e8e66cf](https://github.com/googleapis/python-bigquery-dataframes/commit/e8e66cf25887f64d2a7cb26081c2ef3cea10827d)) +* Add transformers save/load ([#552](https://github.com/googleapis/python-bigquery-dataframes/issues/552)) ([d805241](https://github.com/googleapis/python-bigquery-dataframes/commit/d805241b7ec99fcb7579dce778d4b04778a72002)) +* Allow DataFrame binary ops to align on either axis and with loc… ([#544](https://github.com/googleapis/python-bigquery-dataframes/issues/544)) ([6d8f3af](https://github.com/googleapis/python-bigquery-dataframes/commit/6d8f3afe28d39eb15b969f50d37c58a2c3ff1967)) +* Expose `DataFrame.bqclient` to assist in integrations ([#519](https://github.com/googleapis/python-bigquery-dataframes/issues/519)) ([0be8911](https://github.com/googleapis/python-bigquery-dataframes/commit/0be891191ed89be77494e4dcda30fb37836842ac)) +* Read_pandas accepts pandas Series and Index objects ([#573](https://github.com/googleapis/python-bigquery-dataframes/issues/573)) 
([f8821fe](https://github.com/googleapis/python-bigquery-dataframes/commit/f8821fe7ecf8a80532a6aab98044fad601ff939c)) +* Support `ML.GENERATE_EMBEDDING` in `PaLM2TextEmbeddingGenerator` ([#539](https://github.com/googleapis/python-bigquery-dataframes/issues/539)) ([1156c1e](https://github.com/googleapis/python-bigquery-dataframes/commit/1156c1e3ce8c1e62898dbe68ccd6c5ab3cd4068f)) +* Support max_columns in repr and make repr more efficient ([#515](https://github.com/googleapis/python-bigquery-dataframes/issues/515)) ([54e49cf](https://github.com/googleapis/python-bigquery-dataframes/commit/54e49cff89bd329852a823cd5cf5c5b41b7f9e32)) + + +### Bug Fixes + +* Assign NaN scalar to column error. ([#513](https://github.com/googleapis/python-bigquery-dataframes/issues/513)) ([0a4153c](https://github.com/googleapis/python-bigquery-dataframes/commit/0a4153cc71a44c09b8d691897f1e5afa58c69f25)) +* Don't download 100gb onto local python machine in load test ([#537](https://github.com/googleapis/python-bigquery-dataframes/issues/537)) ([082c58b](https://github.com/googleapis/python-bigquery-dataframes/commit/082c58bbe76821b90337dc5af0ab5fa7515682c2)) +* Exclude list-like s parameter in plot.scatter ([#568](https://github.com/googleapis/python-bigquery-dataframes/issues/568)) ([1caac27](https://github.com/googleapis/python-bigquery-dataframes/commit/1caac27fe95ef3eb36bad2ac351090891922858c)) +* Fix case where df.peek would fail to execute even with force=True ([#511](https://github.com/googleapis/python-bigquery-dataframes/issues/511)) ([8eca99a](https://github.com/googleapis/python-bigquery-dataframes/commit/8eca99a03bc4bdaccf15a979b5382f3659f2aac5)) +* Fix error in `Series.drop(0)` ([#575](https://github.com/googleapis/python-bigquery-dataframes/issues/575)) ([75dd786](https://github.com/googleapis/python-bigquery-dataframes/commit/75dd7862e60502c97f7defe5dfefb044ea74bae8)) +* Include all names in MultiIndex repr ([#564](https://github.com/googleapis/python-bigquery-dataframes/issues/564)) ([b188146](https://github.com/googleapis/python-bigquery-dataframes/commit/b188146466780e6f7a041f51f5be51a7d60719c9)) +* Plot.scatter s parameter cannot accept float-like column ([#563](https://github.com/googleapis/python-bigquery-dataframes/issues/563)) ([8d39187](https://github.com/googleapis/python-bigquery-dataframes/commit/8d3918761a17649180aa806d7b01aa103f69b4fe)) +* Product operation produces float result for all input types ([#501](https://github.com/googleapis/python-bigquery-dataframes/issues/501)) ([6873b30](https://github.com/googleapis/python-bigquery-dataframes/commit/6873b30b691a11a368308825a72013d8ec1408ed)) +* Reloaded transformer .transform error ([#569](https://github.com/googleapis/python-bigquery-dataframes/issues/569)) ([39fe474](https://github.com/googleapis/python-bigquery-dataframes/commit/39fe47451d24a8cf55d7dbb15c6d3b176d25ab18)) +* Rename PaLM2TextEmbeddingGenerator.predict output columns to be backward compatible ([#561](https://github.com/googleapis/python-bigquery-dataframes/issues/561)) ([4995c00](https://github.com/googleapis/python-bigquery-dataframes/commit/4995c0046265463bc5c502cbeb34c7632d5a255e)) +* Respect hard stack size limit and swallow limit change exception. 
([#558](https://github.com/googleapis/python-bigquery-dataframes/issues/558)) ([4833908](https://github.com/googleapis/python-bigquery-dataframes/commit/483390830ae0ee2fe0fb47dc7d2aea143b2dc7d8)) +* Restore string to date/time type coercion ([#565](https://github.com/googleapis/python-bigquery-dataframes/issues/565)) ([4ae0262](https://github.com/googleapis/python-bigquery-dataframes/commit/4ae0262a2b1dfc35c1e4c3392b9e21456d6e964e)) +* Sync the notebook with embedding changes ([#550](https://github.com/googleapis/python-bigquery-dataframes/issues/550)) ([347f2dd](https://github.com/googleapis/python-bigquery-dataframes/commit/347f2dda2298e17cd44a298f04a723f2d20c080a)) +* Use bytes limit on frame inlining rather than element count ([#576](https://github.com/googleapis/python-bigquery-dataframes/issues/576)) ([659a161](https://github.com/googleapis/python-bigquery-dataframes/commit/659a161a53e93f66334cd04d1c3dc1f1f47ecc16)) + + +### Performance Improvements + +* Add multi-query execution capability for complex dataframes ([#427](https://github.com/googleapis/python-bigquery-dataframes/issues/427)) ([d2d7e33](https://github.com/googleapis/python-bigquery-dataframes/commit/d2d7e33b1f8b4e184ef3e76eedbd673a8fcee60e)) + + +### Dependencies + +* Include `pyarrow` as a dependency ([#529](https://github.com/googleapis/python-bigquery-dataframes/issues/529)) ([9b1525a](https://github.com/googleapis/python-bigquery-dataframes/commit/9b1525a0c359455160bfbc0dc1366e37982ad01f)) + + +### Documentation + +* `bigframes.options.bigquery.project` and `location` are optional in some circumstances ([#548](https://github.com/googleapis/python-bigquery-dataframes/issues/548)) ([90bcec5](https://github.com/googleapis/python-bigquery-dataframes/commit/90bcec5c73f7eefeff14bbd8bdcad3a4c9d91d8f)) +* Add "Supported pandas APIs" reference to the documentation ([#542](https://github.com/googleapis/python-bigquery-dataframes/issues/542)) ([74c3915](https://github.com/googleapis/python-bigquery-dataframes/commit/74c391586280b55c35d66c697167122d72c13386)) +* Add General Availability banner to README ([#507](https://github.com/googleapis/python-bigquery-dataframes/issues/507)) ([262ff59](https://github.com/googleapis/python-bigquery-dataframes/commit/262ff5922643039e037bd9b6c0a91b5bd20a4e08)) +* Add opeartions in API docs ([#557](https://github.com/googleapis/python-bigquery-dataframes/issues/557)) ([ea95761](https://github.com/googleapis/python-bigquery-dataframes/commit/ea9576125d46f3912372f75ebe51196ba83e96db)) +* Add progress_bar code sample ([#508](https://github.com/googleapis/python-bigquery-dataframes/issues/508)) ([92a1af3](https://github.com/googleapis/python-bigquery-dataframes/commit/92a1af35b8de4afb6cdb5b5e89facdceb5c151d2)) +* Add the code samples for metrics{auc, roc_auc_score, roc_curve} ([#520](https://github.com/googleapis/python-bigquery-dataframes/issues/520)) ([5f37b09](https://github.com/googleapis/python-bigquery-dataframes/commit/5f37b0902fae2c099207acf3ce2e251c09ac889d)) +* Address more comments from technical writers to meet legal purposes ([#571](https://github.com/googleapis/python-bigquery-dataframes/issues/571)) ([9084df3](https://github.com/googleapis/python-bigquery-dataframes/commit/9084df369bc6819edf5f57ceba85667a14371ac5)) +* Fix docs of ARIMAPlus.predict ([#512](https://github.com/googleapis/python-bigquery-dataframes/issues/512)) ([3b80f95](https://github.com/googleapis/python-bigquery-dataframes/commit/3b80f956755c9d7043138aab6e5687cba50be8cb)) +* Include Index in table-of-contents 
([#564](https://github.com/googleapis/python-bigquery-dataframes/issues/564)) ([b188146](https://github.com/googleapis/python-bigquery-dataframes/commit/b188146466780e6f7a041f51f5be51a7d60719c9)) +* Mark Gemini model as Pre-GA ([#543](https://github.com/googleapis/python-bigquery-dataframes/issues/543)) ([769868b](https://github.com/googleapis/python-bigquery-dataframes/commit/769868b9fc7dfff2e7b1ed5cec52a5dd3dfd6ff2)) +* Migrate the overview page to Bigframes official landing page ([#536](https://github.com/googleapis/python-bigquery-dataframes/issues/536)) ([a0fb8bb](https://github.com/googleapis/python-bigquery-dataframes/commit/a0fb8bbfddd07f1e0ef03eeb4be653d1e9f06772)) + ## [1.0.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.26.0...v1.0.0) (2024-03-25) diff --git a/README.rst b/README.rst index 73709641de..64d1e4e72c 100644 --- a/README.rst +++ b/README.rst @@ -1,6 +1,8 @@ BigQuery DataFrames =================== +|GA| |pypi| |versions| + BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine. @@ -10,395 +12,25 @@ powered by the BigQuery engine. BigQuery DataFrames is an open-source package. You can run ``pip install --upgrade bigframes`` to install the latest version. +.. |GA| image:: https://img.shields.io/badge/support-GA-gold.svg + :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#general-availability +.. |pypi| image:: https://img.shields.io/pypi/v/bigframes.svg + :target: https://pypi.org/project/bigframes/ +.. |versions| image:: https://img.shields.io/pypi/pyversions/bigframes.svg + :target: https://pypi.org/project/bigframes/ + Documentation ------------- * `BigQuery DataFrames source code (GitHub) `_ * `BigQuery DataFrames sample notebooks `_ * `BigQuery DataFrames API reference `_ -* `BigQuery documentation `_ - - -Quickstart ----------- - -Prerequisites -^^^^^^^^^^^^^ - -* Install the ``bigframes`` package. -* Create a Google Cloud project and billing account. -* In an interactive environment (like Notebook, Python REPL or command line), - ``bigframes`` will do the authentication on-the-fly if needed. Otherwise, see - `how to set up application default credentials `_ - for various environments. For example, to pre-authenticate on your laptop you can - `install and initialize the gcloud CLI `_, - and then generate the application default credentials by doing - `gcloud auth application-default login `_. -* The user must have - `BigQuery Job User `_ and - `BigQuery Read Session User `_ - roles for the minimum usage. Additional IAM requirements apply for using remote - functions and ML. - -Code sample -^^^^^^^^^^^ - -Import ``bigframes.pandas`` for a pandas-like interface. The ``read_gbq`` -method accepts either a fully-qualified table ID or a SQL query. - -.. code-block:: python - - import bigframes.pandas as bpd - - bpd.options.bigquery.project = your_gcp_project_id - df1 = bpd.read_gbq("project.dataset.table") - df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`") - -* `More code samples `_ - - -Locations ---------- -BigQuery DataFrames uses a -`BigQuery session `_ -internally to manage metadata on the service side. This session is tied to a -`location `_ . -BigQuery DataFrames uses the US multi-region as the default location, but you -can use ``session_options.location`` to set a different location. Every query -in a session is executed in the location where the session was created. 
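As a quick orientation for this change, here is a minimal sketch of the location workflow that the removed README text describes; the project ID, location values, and table ID below are placeholders, not part of this diff.

```python
import bigframes.pandas as bpd

# Set project and location before the first query; every query in the
# session then runs in the location where the session was created.
bpd.options.bigquery.project = "my-project"          # placeholder
bpd.options.bigquery.location = "asia-northeast1"    # placeholder

df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table ID

# Closing the session allows a different location to be chosen afterwards.
bpd.close_session()
bpd.options.bigquery.location = "US"
```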
-BigQuery DataFrames -auto-populates ``bf.options.bigquery.location`` if the user starts with -``read_gbq/read_gbq_table/read_gbq_query()`` and specifies a table, either -directly or in a SQL statement. - -If you want to reset the location of the created DataFrame or Series objects, -you can close the session by executing ``bigframes.pandas.close_session()``. -After that, you can reuse ``bigframes.pandas.options.bigquery.location`` to -specify another location. - - -``read_gbq()`` requires you to specify a location if the dataset you are -querying is not in the US multi-region. If you try to read a table from another -location, you get a NotFound exception. - -Project -------- -If ``bf.options.bigquery.project`` is not set, the ``$GOOGLE_CLOUD_PROJECT`` -environment variable is used, which is set in the notebook runtime serving the -BigQuery Studio/Vertex Notebooks. - -ML Capabilities ---------------- - -The ML capabilities in BigQuery DataFrames let you preprocess data, and -then train models on that data. You can also chain these actions together to -create data pipelines. - -Preprocess data -^^^^^^^^^^^^^^^^^^^^^^^^ - -Create transformers to prepare data for use in estimators (models) by -using the -`bigframes.ml.preprocessing module `_ -and the `bigframes.ml.compose module `_. -BigQuery DataFrames offers the following transformations: - -* Use the `KBinsDiscretizer class `_ - in the ``bigframes.ml.preprocessing`` module to bin continuous data into intervals. -* Use the `LabelEncoder class `_ - in the ``bigframes.ml.preprocessing`` module to normalize the target labels as integer values. -* Use the `MaxAbsScaler class `_ - in the ``bigframes.ml.preprocessing`` module to scale each feature to the range ``[-1, 1]`` by its maximum absolute value. -* Use the `MinMaxScaler class `_ - in the ``bigframes.ml.preprocessing`` module to standardize features by scaling each feature to the range ``[0, 1]``. -* Use the `StandardScaler class `_ - in the ``bigframes.ml.preprocessing`` module to standardize features by removing the mean and scaling to unit variance. -* Use the `OneHotEncoder class `_ - in the ``bigframes.ml.preprocessing`` module to transform categorical values into numeric format. -* Use the `ColumnTransformer class `_ - in the ``bigframes.ml.compose`` module to apply transformers to DataFrames columns. - - -Train models -^^^^^^^^^^^^ - -Create estimators to train models in BigQuery DataFrames. - -**Clustering models** - -Create estimators for clustering models by using the -`bigframes.ml.cluster module `_. - -* Use the `KMeans class `_ - to create K-means clustering models. Use these models for - data segmentation. For example, identifying customer segments. K-means is an - unsupervised learning technique, so model training doesn't require labels or split - data for training or evaluation. - -**Decomposition models** - -Create estimators for decomposition models by using the `bigframes.ml.decomposition module `_. - -* Use the `PCA class `_ - to create principal component analysis (PCA) models. Use these - models for computing principal components and using them to perform a change of - basis on the data. This provides dimensionality reduction by projecting each data - point onto only the first few principal components to obtain lower-dimensional - data while preserving as much of the data's variation as possible. - - -**Ensemble models** - -Create estimators for ensemble models by using the `bigframes.ml.ensemble module `_. 
- -* Use the `RandomForestClassifier class `_ - to create random forest classifier models. Use these models for constructing multiple - learning method decision trees for classification. -* Use the `RandomForestRegressor class `_ - to create random forest regression models. Use - these models for constructing multiple learning method decision trees for regression. -* Use the `XGBClassifier class `_ - to create gradient boosted tree classifier models. Use these models for additively - constructing multiple learning method decision trees for classification. -* Use the `XGBRegressor class `_ - to create gradient boosted tree regression models. Use these models for additively - constructing multiple learning method decision trees for regression. - - -**Forecasting models** - -Create estimators for forecasting models by using the `bigframes.ml.forecasting module `_. - -* Use the `ARIMAPlus class `_ - to create time series forecasting models. - -**Imported models** - -Create estimators for imported models by using the `bigframes.ml.imported module `_. - -* Use the `ONNXModel class `_ - to import Open Neural Network Exchange (ONNX) models. -* Use the `TensorFlowModel class `_ - to import TensorFlow models. -* Use the `XGBoostModel class `_ - to import XGBoostModel models. - -**Linear models** - -Create estimators for linear models by using the `bigframes.ml.linear_model module `_. - -* Use the `LinearRegression class `_ - to create linear regression models. Use these models for forecasting. For example, - forecasting the sales of an item on a given day. -* Use the `LogisticRegression class `_ - to create logistic regression models. Use these models for the classification of two - or more possible values such as whether an input is ``low-value``, ``medium-value``, - or ``high-value``. - -**Large language models** - -Create estimators for LLMs by using the `bigframes.ml.llm module `_. - -* Use the `GeminiTextGenerator class `_ to create Gemini text generator models. Use these models - for text generation tasks. -* Use the `PaLM2TextGenerator class `_ to create PaLM2 text generator models. Use these models - for text generation tasks. -* Use the `PaLM2TextEmbeddingGenerator class `_ to create PaLM2 text embedding generator models. - Use these models for text embedding generation tasks. - - -Create pipelines -^^^^^^^^^^^^^^^^ - -Create ML pipelines by using -`bigframes.ml.pipeline module `_. -Pipelines let you assemble several ML steps to be cross-validated together while setting -different parameters. This simplifies your code, and allows you to deploy data preprocessing -steps and an estimator together. - -* Use the `Pipeline class `_ - to create a pipeline of transforms with a final estimator. - - -ML remote models ----------------- - -**Requirements** - -To use BigQuery DataFrames ML remote models (`bigframes.ml.remote` or `bigframes.ml.llm`), -you must enable the following APIs: - -* The BigQuery API (bigquery.googleapis.com) -* The BigQuery Connection API (bigqueryconnection.googleapis.com) -* The Vertex AI API (aiplatform.googleapis.com) - -and you must be granted the following IAM roles in the project: - -* BigQuery Data Editor (roles/bigquery.dataEditor) -* BigQuery Connection Admin (roles/bigquery.connectionAdmin) -* Service Account User (roles/iam.serviceAccountUser) -* Vertex AI User (roles/aiplatform.user) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default - BigQuery connection, or Browser (roles/browser) if using a pre-configured connection. 
- This requirement can be avoided by setting - ``bigframes.pandas.options.bigquery.skip_bq_connection_check`` option to ``True``, - in which case the connection (default or pre-configured) would be - used as-is without any existence or permission check. - - -ML locations ------------- - -``bigframes.ml`` supports the same locations as BigQuery ML. BigQuery ML model -prediction and other ML functions are supported in all BigQuery regions. Support -for model training varies by region. For more information, see -`BigQuery ML locations `_. - - -Data types ----------- - -BigQuery DataFrames supports the following numpy and pandas dtypes: - -* ``numpy.dtype("O")`` -* ``pandas.BooleanDtype()`` -* ``pandas.Float64Dtype()`` -* ``pandas.Int64Dtype()`` -* ``pandas.StringDtype(storage="pyarrow")`` -* ``pandas.ArrowDtype(pa.date32())`` -* ``pandas.ArrowDtype(pa.time64("us"))`` -* ``pandas.ArrowDtype(pa.timestamp("us"))`` -* ``pandas.ArrowDtype(pa.timestamp("us", tz="UTC"))`` - -BigQuery DataFrames doesn’t support the following BigQuery data types: - -* ``ARRAY`` -* ``NUMERIC`` -* ``BIGNUMERIC`` -* ``INTERVAL`` -* ``STRUCT`` -* ``JSON`` - -All other BigQuery data types display as the object type. - - -Remote functions ----------------- - -BigQuery DataFrames gives you the ability to turn your custom scalar functions -into `BigQuery remote functions -`_ . Creating a remote -function in BigQuery DataFrames (See `code samples -`_) -creates: - -1. A `Cloud Functions (2nd gen) function `_. -2. A `BigQuery connection `_. - If the BigQuery connection is created, the BigQuery service will - create a - `Google Cloud-managed IAM service account `_ - and attach it to the connection. You can use a pre-configured BigQuery - connection if you prefer, in which case the connection creation is skipped. -3. A BigQuery remote function that talks to the cloud function (1) using the BigQuery - connection (2). - -BigQuery connections are created in the same location as the BigQuery -DataFrames session, using the name you provide in the custom function -definition. To view and manage connections, do the following: - -1. Go to `BigQuery in the Google Cloud Console `__. -2. Select the project in which you created the remote function. -3. In the Explorer pane, expand that project and then expand External connections. - -BigQuery remote functions are created in the dataset you specify, or -in a special type of `hidden dataset `__ -referred to as an anonymous dataset. To view and manage remote functions created -in a user provided dataset, do the following: - -1. Go to `BigQuery in the Google Cloud Console `__. -2. Select the project in which you created the remote function. -3. In the Explorer pane, expand that project, expand the dataset in which you - created the remote function, and then expand Routines. - -To view and manage Cloud Functions functions, use the -`Functions `_ -page and use the project picker to select the project in which you -created the function. For easy identification, the names of the functions -created by BigQuery DataFrames are prefixed by ``bigframes``. 
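For context on why these Cloud resources exist, a hedged usage sketch of a remote function follows; the decorator arguments, function body, and table/column names are illustrative assumptions rather than part of this change.

```python
import bigframes.pandas as bpd

# Deploys the Python function as a Cloud Functions (2nd gen) function plus a
# BigQuery remote function, then applies it to a column server-side.
@bpd.remote_function([float], float)
def celsius_to_fahrenheit(celsius):
    return celsius * 9.0 / 5.0 + 32.0

df = bpd.read_gbq("my-project.weather.readings")      # placeholder table ID
df["temp_f"] = df["temp_c"].apply(celsius_to_fahrenheit)
```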
- -**Requirements** - -To use BigQuery DataFrames remote functions, you must enable the following APIs: - -* The BigQuery API (bigquery.googleapis.com) -* The BigQuery Connection API (bigqueryconnection.googleapis.com) -* The Cloud Functions API (cloudfunctions.googleapis.com) -* The Cloud Run API (run.googleapis.com) -* The Artifact Registry API (artifactregistry.googleapis.com) -* The Cloud Build API (cloudbuild.googleapis.com ) -* The Cloud Resource Manager API (cloudresourcemanager.googleapis.com) - -To use BigQuery DataFrames remote functions, you must be granted the -following IAM roles in the project: - -* BigQuery Data Editor (roles/bigquery.dataEditor) -* BigQuery Connection Admin (roles/bigquery.connectionAdmin) -* Cloud Functions Developer (roles/cloudfunctions.developer) -* Service Account User (roles/iam.serviceAccountUser) -* Storage Object Viewer (roles/storage.objectViewer) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default - BigQuery connection, or Browser (roles/browser) if using a pre-configured connection. - This requirement can be avoided by setting - ``bigframes.pandas.options.bigquery.skip_bq_connection_check`` option to ``True``, - in which case the connection (default or pre-configured) would be - used as-is without any existence or permission check. - -**Limitations** - -* Remote functions take about 90 seconds to become available when you first create them. -* Trivial changes in the notebook, such as inserting a new cell or renaming a variable, - might cause the remote function to be re-created, even if these changes are unrelated - to the remote function code. -* BigQuery DataFrames does not differentiate any personal data you include in the remote - function code. The remote function code is serialized as an opaque box to deploy it as a - Cloud Functions function. -* The Cloud Functions (2nd gen) functions, BigQuery connections, and BigQuery remote - functions created by BigQuery DataFrames persist in Google Cloud. If you don’t want to - keep these resources, you must delete them separately using an appropriate Cloud Functions - or BigQuery interface. -* A project can have up to 1000 Cloud Functions (2nd gen) functions at a time. See Cloud - Functions quotas for all the limits. - - -Quotas and limits ------------------- - -`BigQuery quotas `_ -including hardware, software, and network components. - - -Session termination -------------------- - -Each BigQuery DataFrames DataFrame or Series object is tied to a BigQuery -DataFrames session, which is in turn based on a BigQuery session. BigQuery -sessions -`auto-terminate `_ -; when this happens, you can’t use previously -created DataFrame or Series objects and must re-create them using a new -BigQuery DataFrames session. You can do this by running -``bigframes.pandas.close_session()`` and then re-running the BigQuery -DataFrames expressions. - -Data processing location ------------------------- -BigQuery DataFrames is designed for scale, which it achieves by keeping data -and processing on the BigQuery service. However, you can bring data into the -memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series -object. If you choose to do this, the memory limitation of your client machine -applies. +Getting started with BigQuery DataFrames +---------------------------------------- +Try the `BigQuery DataFrames quickstart `_ +to get up and running in just a few minutes. 
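For readers skimming this change, a compact sketch of the basic usage that the linked quickstart covers, adapted from the sample removed above; the project ID is a placeholder, and the query targets a well-known public dataset.

```python
import bigframes.pandas as bpd

bpd.options.bigquery.project = "your-gcp-project-id"  # placeholder

df = bpd.read_gbq(
    "SELECT name, number FROM `bigquery-public-data.usa_names.usa_1910_2013`"
)

# Processing stays in BigQuery until results are explicitly pulled down.
local_preview = df.head(10).to_pandas()
```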
License diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index d035fe5df1..50e14eaf28 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -56,7 +56,8 @@ def __init__( def application_name(self) -> Optional[str]: """The application name to amend to the user-agent sent to Google APIs. - Recommended format is ``"appplication-name/major.minor.patch_version"`` + The application name to amend to the user agent sent to Google APIs. + The recommended format is ``"application-name/major.minor.patch_version"`` or ``"(gpn:PartnerName;)"`` for official Google partners. """ return self._application_name @@ -71,7 +72,7 @@ def application_name(self, value: Optional[str]): @property def credentials(self) -> Optional[google.auth.credentials.Credentials]: - """The OAuth2 Credentials to use for this client.""" + """The OAuth2 credentials to use for this client.""" return self._credentials @credentials.setter @@ -84,7 +85,7 @@ def credentials(self, value: Optional[google.auth.credentials.Credentials]): def location(self) -> Optional[str]: """Default location for job, datasets, and tables. - See: https://cloud.google.com/bigquery/docs/locations + For more information, see https://cloud.google.com/bigquery/docs/locations BigQuery locations. """ return self._location @@ -107,13 +108,15 @@ def project(self, value: Optional[str]): @property def bq_connection(self) -> Optional[str]: - """Name of the BigQuery connection to use. Should be of the form + """Name of the BigQuery connection to use in the form ... - You should either have the connection already created in the - location you have chosen, or you should have the Project IAM - Admin role to enable the service to create the connection for you if you - need it. + You either need to create the connection in a location of your choice, or + you need the Project Admin IAM role to enable the service to create the + connection for you. + + If this option isn't available, or the project or location isn't provided, + then the default connection project/location/connection_id is used in the session. If this option isn't provided, or project or location aren't provided, session will use its default project/location/connection_id as default connection. @@ -151,12 +154,12 @@ def use_regional_endpoints(self) -> bool: """Flag to connect to regional API endpoints. .. deprecated:: 0.13.0 - Use of regional endpoints is a feature in preview and + Use of regional endpoints is a feature in Preview and available only in selected regions and projects. - Requires ``location`` to also be set. For example, set - ``location='asia-northeast1'`` and ``use_regional_endpoints=True`` to - connect to asia-northeast1-bigquery.googleapis.com. + Requires that ``location`` is set. For example, to connect to + asia-northeast1-bigquery.googleapis.com, specify + ``location='asia-northeast1'`` and ``use_regional_endpoints=True``. """ return self._use_regional_endpoints @@ -177,17 +180,22 @@ def use_regional_endpoints(self, value: bool): @property def kms_key_name(self) -> Optional[str]: - """Customer managed encryption key used to control encryption of the + """ + Customer-managed encryption key + used to control encryption of the data at rest in BigQuery. This key + takes the format projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY + + Customer managed encryption key used to control encryption of the data-at-rest in BigQuery. 
This is of the format projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY - See https://cloud.google.com/bigquery/docs/customer-managed-encryption - for more details. + For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption + Customer-managed Cloud KMS keys - Please make sure the project used for Bigquery DataFrames has "Cloud KMS - CryptoKey Encrypter/Decrypter" role in the key's project, See - https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role - for steps on how to ensure that. + Make sure the project used for Bigquery DataFrames has the + Cloud KMS CryptoKey Encrypter/Decrypter IAM role in the key's project. + For more information, see https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role + Assign the Encrypter/Decrypter. """ return self._kms_key_name diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index fb708b844c..2b849c558a 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -40,7 +40,11 @@ class ComputeOptions: bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. See `maximum_bytes_billed `_. - + enable_multi_query_execution (bool, Options): + If enabled, large queries may be factored into multiple smaller queries + in order to avoid generating queries that are too complex for the query + engine to handle. However this comes at the cost of increase cost and latency. """ maximum_bytes_billed: Optional[int] = None + enable_multi_query_execution: bool = False diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 6fd6fc23c2..9358dab1b1 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -229,6 +229,10 @@ def assign_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> ArrayValue: + if pandas.isna(value): + # Need to assign a data type when value is NaN. + dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE + if destination_id in self.column_ids: # Mutate case exprs = [ ( @@ -397,6 +401,15 @@ def join( return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: + assert len(column_ids) > 0 + for column_id in column_ids: + assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) + + return ArrayValue( + nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids)) + ) + def _uniform_sampling(self, fraction: float) -> ArrayValue: """Sampling the table on given fraction. 
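A hedged sketch of two additions visible in this hunk: the new `enable_multi_query_execution` compute option and array explosion at the DataFrame level. The local Arrow list dtype is just one assumed way to produce an array column; names and values are illustrative.

```python
import pandas as pd
import pyarrow as pa
import bigframes.pandas as bpd

# New compute option from this change (off by default).
bpd.options.compute.enable_multi_query_execution = True

pdf = pd.DataFrame(
    {
        "id": [1, 2],
        "values": pd.Series(
            [[10, 20, 30], [40]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
        ),
    }
)
df = bpd.read_pandas(pdf)
exploded = df.explode("values")  # one output row per array element
```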
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index afa13375b1..c7b41e93eb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -41,6 +41,7 @@ import bigframes.core.guid as guid import bigframes.core.join_def as join_defs import bigframes.core.ordering as ordering +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils import bigframes.core.utils as utils import bigframes.dtypes @@ -443,8 +444,10 @@ def to_pandas( df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job - def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]: - if self.expr.node.peekable: + def try_peek( + self, n: int = 20, force: bool = False + ) -> typing.Optional[pd.DataFrame]: + if force or tree_properties.peekable(self.expr.node): iterator, _ = self.session._peek(self.expr, n) df = self._to_dataframe(iterator) self._copy_index_to_pandas(df) @@ -1159,6 +1162,36 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()): index_labels=self.column_labels.names, ) + def explode( + self, + column_ids: typing.Sequence[str], + ignore_index: Optional[bool], + ) -> Block: + column_ids = [ + column_id + for column_id in column_ids + if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id)) + ] + if len(column_ids) == 0: + expr = self.expr + else: + expr = self.expr.explode(column_ids) + + if ignore_index: + return Block( + expr.drop_columns(self.index_columns), + column_labels=self.column_labels, + # Initiates default index creation using the block constructor. + index_columns=[], + ) + else: + return Block( + expr, + column_labels=self.column_labels, + index_columns=self.index_columns, + index_labels=self.column_labels.names, + ) + def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]: """ Gets a standard set of stats to preemptively fetch for a column if @@ -1311,8 +1344,8 @@ def retrieve_repr_request_results( head_block = self computed_df, query_job = head_block.to_pandas() formatted_df = computed_df.set_axis(self.column_labels, axis=1) - # we reset the axis and substitute the bf index name for the default - formatted_df.index.name = self.index.name + # we reset the axis and substitute the bf index name(s) for the default + formatted_df.index.names = self.index.names # type: ignore return formatted_df, count, query_job def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]: @@ -1840,6 +1873,10 @@ def cached(self, *, optimize_offsets=False, force: bool = False) -> Block: expr = self.session._cache_with_cluster_cols( self.expr, cluster_cols=self.index_columns ) + return self.swap_array_expr(expr) + + def swap_array_expr(self, expr: core.ArrayValue) -> Block: + # TODO: Validate schema unchanged return Block( expr, index_columns=self.index_columns, diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 9c1db0f162..ae21243506 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -190,7 +190,7 @@ def _( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) # type: ignore + return float_result @compile_unary_agg.register diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index af2d69275a..f1c5d62010 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -20,6 +20,7 @@ import typing from typing import Collection, Iterable, Literal, Optional, 
Sequence +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.backends.bigquery as ibis_bigquery import ibis.common.deferred # type: ignore @@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) + def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR: + table = self._to_ibis_expr() + + # The offset array ensures null represents empty arrays after unnesting. + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = ( + vendored_ibis_ops.GenerateArray( + ibis.greatest( + 0, + ibis.least( + *[table[column_id].length() - 1 for column_id in column_ids] + ), + ) + ) + .to_expr() + .name(offset_array_id), + ) + table_w_offset_array = table.select( + offset_array, + *self._column_names, + ) + + unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *self._column_names, + ) + + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in self._column_names + ] + table_w_unnest = table_w_offset.select(*unnested_columns) + + columns = [table_w_unnest[column_name] for column_name in self._column_names] + return UnorderedIR( + table_w_unnest, + columns=columns, + ) + ## Helpers def _set_or_replace_by_id( self, id: str, new_value: ibis_types.Value @@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ordering=self._ordering, ) + def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: + table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = ( + vendored_ibis_ops.GenerateArray( + ibis.greatest( + 0, + ibis.least( + *[table[column_id].length() - 1 for column_id in column_ids] + ), + ) + ) + .to_expr() + .name(offset_array_id), + ) + table_w_offset_array = table.select( + offset_array, + *self._column_names, + *self._hidden_ordering_column_names, + ) + + unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *self._column_names, + *self._hidden_ordering_column_names, + ) + + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in self._column_names + ] + + table_w_unnest = table_w_offset.select( + table_w_offset[unnest_offset_id], + *unnested_columns, + *self._hidden_ordering_column_names, + ) + + columns = [table_w_unnest[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + *[ + table_w_unnest[column_name] + for column_name in self._hidden_ordering_column_names + ], + table_w_unnest[unnest_offset_id], + ] + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *self._ordering.ordering_value_columns, + ascending_over(unnest_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*self._ordering.total_ordering_columns, unnest_offset_id] + ), + ) + + return OrderedIR( + table_w_unnest, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=ordering, + ) + def 
promote_offsets(self, col_id: str) -> OrderedIR: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 6f10d85f31..638e3eacdd 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): ) +@_compile_node.register +def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): + return compile_node(node.child, ordered).explode(node.column_ids) + + @_compile_node.register def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): return compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index d2fc453835..5c165fa1df 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -257,6 +257,13 @@ def arctan_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).atan() +@scalar_op_compiler.register_binary_op(ops.arctan2_op) +def arctan2_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan2( + typing.cast(ibis_types.NumericValue, y) + ) + + # Hyperbolic trig functions # BQ has these functions, but Ibis doesn't @scalar_op_compiler.register_unary_op(ops.sinh_op) @@ -319,6 +326,30 @@ def arctanh_op_impl(x: ibis_types.Value): # Numeric Ops +@scalar_op_compiler.register_unary_op(ops.floor_op) +def floor_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_floor(x_numeric) + else: # numeric + return x_numeric.floor() + + +@scalar_op_compiler.register_unary_op(ops.ceil_op) +def ceil_op_impl(x: ibis_types.Value): + x_numeric = typing.cast(ibis_types.NumericValue, x) + if x_numeric.type().is_integer(): + return x_numeric.cast(ibis_dtypes.Float64()) + if x_numeric.type().is_floating(): + # Default ibis impl tries to cast to integer, which doesn't match pandas and can overflow + return float_ceil(x_numeric) + else: # numeric + return x_numeric.ceil() + + @scalar_op_compiler.register_unary_op(ops.abs_op) def abs_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).abs() @@ -347,6 +378,11 @@ def ln_op_impl(x: ibis_types.Value): return (~domain).ifelse(out_of_domain, numeric_value.ln()) +@scalar_op_compiler.register_unary_op(ops.log1p_op) +def log1p_op_impl(x: ibis_types.Value): + return ln_op_impl(_ibis_num(1) + x) + + @scalar_op_compiler.register_unary_op(ops.exp_op) def exp_op_impl(x: ibis_types.Value): numeric_value = typing.cast(ibis_types.NumericValue, x) @@ -354,6 +390,11 @@ def exp_op_impl(x: ibis_types.Value): return (~domain).ifelse(_INF, numeric_value.exp()) +@scalar_op_compiler.register_unary_op(ops.expm1_op) +def expm1_op_impl(x: ibis_types.Value): + return exp_op_impl(x) - _ibis_num(1) + + @scalar_op_compiler.register_unary_op(ops.invert_op) def invert_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).negate() @@ -1318,3 +1359,16 @@ def _ibis_num(number: float): @ibis.udf.scalar.builtin def timestamp(a: str) -> ibis_dtypes.timestamp: """Convert string to timestamp.""" + 
+ +# Need these because ibis otherwise tries to do casts to int that can fail +@ibis.udf.scalar.builtin(name="floor") +def float_floor(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER + + +@ibis.udf.scalar.builtin(name="ceil") +def float_ceil(a: float) -> float: + """Convert string to timestamp.""" + return 0 # pragma: NO COVER diff --git a/bigframes/core/convert.py b/bigframes/core/convert.py new file mode 100644 index 0000000000..98f854ad72 --- /dev/null +++ b/bigframes/core/convert.py @@ -0,0 +1,49 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import pandas as pd + +import bigframes.core.indexes as index +import bigframes.series as series + + +def to_bf_series(obj, default_index: index.Index) -> series.Series: + if isinstance(obj, series.Series): + return obj + if isinstance(obj, pd.Series): + return series.Series(obj) + if isinstance(obj, index.Index): + return series.Series(obj, default_index) + if isinstance(obj, pd.Index): + return series.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return series.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") + + +def to_pd_series(obj, default_index: pd.Index) -> pd.Series: + if isinstance(obj, series.Series): + return obj.to_pandas() + if isinstance(obj, pd.Series): + return obj + if isinstance(obj, index.Index): + return pd.Series(obj.to_pandas(), default_index) + if isinstance(obj, pd.Index): + return pd.Series(obj, default_index) + if pd.api.types.is_list_like(obj): + return pd.Series(obj, default_index) + else: + raise TypeError(f"Cannot interpret {obj} as series.") diff --git a/bigframes/core/eval.py b/bigframes/core/eval.py new file mode 100644 index 0000000000..692ca1c7bb --- /dev/null +++ b/bigframes/core/eval.py @@ -0,0 +1,71 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +from typing import Optional + +import bigframes_vendored.pandas.core.computation.eval as vendored_pandas_eval +import bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing + +import bigframes.dataframe as dataframe +import bigframes.dtypes +import bigframes.series as series + + +def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]): + """ + Evaluate the given python expression + + Args: + df (DataFrame): + Columns of this dataframe will be used to resolve variables in expression. 
+ expr (str): + One or more python expression to evaluate. + target (DataFrame or None): + The evaluation result will be written to the target if provided. + + Returns: + Result of evaluation. + """ + index_resolver = { + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( + df.index.get_level_values(level).to_series() + ) + for level, name in enumerate(df.index.names) + } + column_resolver = { + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series) + for name, series in df.items() + } + # 3 Levels: user -> logging wrapper -> dataframe -> eval helper (this) + return vendored_pandas_eval.eval( + expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver) # type: ignore + ) + + +@dataclasses.dataclass +class FakeNumpyArray: + dtype: bigframes.dtypes.Dtype + + +class EvalSeries(series.Series): + """Slight modified series that works better with pandas.eval""" + + def __init__(self, underlying: series.Series): + super().__init__(data=underlying._block) + + @property + def values(self): + """Returns fake numpy array with only dtype property so that eval can determine schema without actually downloading the data.""" + return FakeNumpyArray(self.dtype) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 8c3f52d22b..4980f5369d 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -108,6 +108,11 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return False + @property + def is_identity(self) -> bool: + """True for identity operation that does not transform input.""" + return False + @dataclasses.dataclass(frozen=True) class ScalarConstantExpression(Expression): @@ -173,6 +178,10 @@ def bind_all_variables(self, bindings: Mapping[str, Expression]) -> Expression: def is_bijective(self) -> bool: return True + @property + def is_identity(self) -> bool: + return True + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index 6419d0985a..ae6011ffa5 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.indexes.index import Index +from bigframes.core.indexes.base import Index __all__ = [ "Index", diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/base.py similarity index 98% rename from bigframes/core/indexes/index.py rename to bigframes/core/indexes/base.py index c818b68711..daa52a02b9 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/base.py @@ -88,7 +88,12 @@ def from_frame( @property def name(self) -> blocks.Label: - return self.names[0] + names = self.names + if len(names) == 1: + return self.names[0] + else: + # pandas returns None for MultiIndex.name. 
+ return None @name.setter def name(self, value: blocks.Label): @@ -460,14 +465,6 @@ def __init__( super().__init__(series_or_dataframe._block) self._whole_frame = series_or_dataframe - @property - def name(self) -> blocks.Label: - return self.names[0] - - @name.setter - def name(self, value: blocks.Label): - self.names = [value] - @property def names(self) -> typing.Sequence[blocks.Label]: """Returns the names of the Index.""" diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 5ebd2a5997..a1072b0d68 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -15,11 +15,11 @@ from __future__ import annotations import abc -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field, fields, replace import functools import itertools import typing -from typing import Tuple +from typing import Callable, Tuple import pandas @@ -39,6 +39,10 @@ import bigframes.session +# A fixed number of variable to assume for overhead on some operations +OVERHEAD_VARIABLES = 5 + + @dataclass(frozen=True) class BigFrameNode: """ @@ -90,11 +94,6 @@ def session(self): def _node_hash(self): return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) - @property - def peekable(self) -> bool: - """Indicates whether the node can be sampled efficiently""" - return all(child.peekable for child in self.child_nodes) - @property def roots(self) -> typing.Set[BigFrameNode]: roots = itertools.chain.from_iterable( @@ -107,6 +106,60 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> schemata.ArraySchema: ... + @property + @abc.abstractmethod + def variables_introduced(self) -> int: + """ + Defines number of values created by the current node. Helps represent the "width" of a query + """ + ... + + @property + def relation_ops_created(self) -> int: + """ + Defines the number of relational ops generated by the current node. Used to estimate query planning complexity. + """ + return 1 + + @property + def joins(self) -> bool: + """ + Defines whether the node joins data. + """ + return False + + @functools.cached_property + def total_variables(self) -> int: + return self.variables_introduced + sum( + map(lambda x: x.total_variables, self.child_nodes) + ) + + @functools.cached_property + def total_relational_ops(self) -> int: + return self.relation_ops_created + sum( + map(lambda x: x.total_relational_ops, self.child_nodes) + ) + + @functools.cached_property + def total_joins(self) -> int: + return int(self.joins) + sum(map(lambda x: x.total_joins, self.child_nodes)) + + @property + def planning_complexity(self) -> int: + """ + Empirical heuristic measure of planning complexity. + + Used to determine when to decompose overly complex computations. May require tuning. + """ + return self.total_variables * self.total_relational_ops * (1 + self.total_joins) + + @abc.abstractmethod + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + """Apply a function to each child node.""" + ... 
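As a rough illustration of the planning-complexity heuristic introduced above, a worked calculation with assumed counts (the numbers do not come from a real query plan).

```python
# planning_complexity = total_variables * total_relational_ops * (1 + total_joins)
total_variables = 13       # assumed: 8 columns from reads plus OVERHEAD_VARIABLES (5) from a join
total_relational_ops = 6   # assumed: two reads (2 ops each), one join, one projection
total_joins = 1
planning_complexity = total_variables * total_relational_ops * (1 + total_joins)
print(planning_complexity)  # 156
```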
+ @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -120,6 +173,11 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.child.schema + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, child=t(self.child)) + @dataclass(frozen=True) class JoinNode(BigFrameNode): @@ -143,12 +201,6 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - children_peekable = all(child.peekable for child in self.child_nodes) - single_root = len(self.roots) == 1 - return children_peekable and single_root - @functools.cached_property def schema(self) -> schemata.ArraySchema: def join_mapping_to_schema_item(mapping: JoinColumnMapping): @@ -165,6 +217,22 @@ def join_mapping_to_schema_item(mapping: JoinColumnMapping): ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return OVERHEAD_VARIABLES + + @property + def joins(self) -> bool: + return True + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace( + self, left_child=t(self.left_child), right_child=t(self.right_child) + ) + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -193,6 +261,16 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" + return len(self.schema.items) + OVERHEAD_VARIABLES + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return replace(self, children=tuple(t(child) for child in self.children)) + # Input Nodex @dataclass(frozen=True) @@ -204,10 +282,6 @@ class ReadLocalNode(BigFrameNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -216,6 +290,16 @@ def roots(self) -> typing.Set[BigFrameNode]: def schema(self) -> schemata.ArraySchema: return self.data_schema + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" + return len(self.schema.items) + 1 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # TODO: Refactor to take raw gbq object reference @dataclass(frozen=True) @@ -233,10 +317,6 @@ def session(self): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return True - @property def roots(self) -> typing.Set[BigFrameNode]: return {self} @@ -252,6 +332,20 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.columns) + len(self.hidden_ordering_columns) + + @property + def relation_ops_created(self) -> int: + # Assume worst case, where readgbq actually has baked in analytic operation to generate index + return 2 + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + return self + # Unary nodes @dataclass(frozen=True) @@ -261,13 +355,9 @@ class PromoteOffsetsNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: - return False + return True @property def schema(self) -> schemata.ArraySchema: @@ -275,6 +365,14 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.col_id, bigframes.dtypes.INT_DTYPE) ) + @property + def relation_ops_created(self) -> int: + return 2 + + @functools.cached_property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -287,6 +385,10 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class OrderByNode(UnaryNode): @@ -304,6 +406,15 @@ def __post_init__(self): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ReversedNode(UnaryNode): @@ -313,11 +424,26 @@ class ReversedNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # Doesnt directly create any relational operations + return 0 + @dataclass(frozen=True) class ProjectionNode(UnaryNode): assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...] + def __post_init__(self): + input_types = self.child.schema._mapping + for expression, id in self.assignments: + # throws TypeError if invalid + _ = expression.output_type(input_types) + def __hash__(self): return self._node_hash @@ -332,6 +458,12 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(items) + @property + def variables_introduced(self) -> int: + # ignore passthrough expressions + new_vars = sum(1 for i in self.assignments if not i[0].is_identity) + return new_vars + # TODO: Merge RowCount into Aggregate Node? # Row count can be compute from table metadata sometimes, so it is a bit special. 
@@ -351,6 +483,10 @@ def schema(self) -> schemata.ArraySchema: (schemata.SchemaItem("count", bigframes.dtypes.INT_DTYPE),) ) + @property + def variables_introduced(self) -> int: + return 1 + @dataclass(frozen=True) class AggregateNode(UnaryNode): @@ -365,10 +501,6 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -388,6 +520,10 @@ def schema(self) -> schemata.ArraySchema: ) return schemata.ArraySchema(tuple([*by_items, *agg_items])) + @property + def variables_introduced(self) -> int: + return len(self.aggregations) + len(self.by_column_ids) + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -401,10 +537,6 @@ class WindowOpNode(UnaryNode): def __hash__(self): return self._node_hash - @property - def peekable(self) -> bool: - return False - @property def non_local(self) -> bool: return True @@ -421,12 +553,31 @@ def schema(self) -> schemata.ArraySchema: schemata.SchemaItem(self.output_name, new_item_dtype) ) + @property + def variables_introduced(self) -> int: + return 1 + + @property + def relation_ops_created(self) -> int: + # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window + return 0 if self.skip_reproject_unsafe else 4 + +# TODO: Remove this op @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def variables_introduced(self) -> int: + return 0 + + @property + def relation_ops_created(self) -> int: + # This op is not a real transformation, just a hint to the sql generator + return 0 + @dataclass(frozen=True) class UnpivotNode(UnaryNode): @@ -454,8 +605,8 @@ def non_local(self) -> bool: return True @property - def peekable(self) -> bool: - return False + def joins(self) -> bool: + return True @functools.cached_property def schema(self) -> schemata.ArraySchema: @@ -498,6 +649,17 @@ def infer_dtype( ] return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) + @property + def variables_introduced(self) -> int: + return ( + len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES + ) + + @property + def relation_ops_created(self) -> int: + # Unpivot is essentially a cross join and a projection. + return 2 + @dataclass(frozen=True) class RandomSampleNode(UnaryNode): @@ -513,3 +675,42 @@ def row_preserving(self) -> bool: def __hash__(self): return self._node_hash + + @property + def variables_introduced(self) -> int: + return 1 + + +@dataclass(frozen=True) +class ExplodeNode(UnaryNode): + column_ids: typing.Tuple[str, ...] 
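    # Illustrative sketch (not part of this change set; names below are made up):
    # ExplodeNode unnests the columns listed in column_ids, producing one output
    # row per list element while repeating the remaining columns, roughly:
    #
    #   df = bpd.DataFrame({"id": [1, 2], "tags": [["a", "b"], ["c"]]})
    #   df.explode("tags")   # -> rows (1, "a"), (1, "b"), (2, "c")
    #
    # where bpd is bigframes.pandas; whether the DataFrame constructor accepts
    # list-valued columns directly is an assumption here, not something this
    # diff shows. The schema property below reflects this by replacing each
    # exploded column's list dtype with its element dtype.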
+ + @property + def row_preserving(self) -> bool: + return False + + def __hash__(self): + return self._node_hash + + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + items = tuple( + schemata.SchemaItem( + name, + bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + self.child.schema.get_type(name).pyarrow_dtype.value_type + ), + ) + if name in self.column_ids + else schemata.SchemaItem(name, self.child.schema.get_type(name)) + for name in self.child.schema.names + ) + return schemata.ArraySchema(items) + + @property + def relation_ops_created(self) -> int: + return 3 + + @functools.cached_property + def variables_introduced(self) -> int: + return len(self.column_ids) + 1 diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 96bf556101..a2851bc256 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -20,7 +20,6 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core.global_session as global_session import bigframes.dataframe import bigframes.operations as ops import bigframes.series @@ -52,21 +51,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - if not isinstance(arg, bigframes.series.Series): - # This block ensures compatibility with local data formats, including - # iterables and pandas.Series - # TODO: Currently, data upload is performed using pandas DataFrames - # combined with the `read_pandas` method due to the BigFrames DataFrame - # constructor's limitations in handling various data types. Plan to update - # the upload process to utilize the BigFrames DataFrame constructor directly - # once it is enhanced for more related datatypes. - arg = global_session.with_default_session( - bigframes.session.Session.read_pandas, pd.DataFrame(arg) - ) - if len(arg.columns) != 1: - raise ValueError("Input must be 1-dimensional.") - - arg = arg[arg.columns[0]] + arg = bigframes.series.Series(arg) if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore raise NotImplementedError( diff --git a/bigframes/core/traversal.py b/bigframes/core/traversal.py deleted file mode 100644 index b038ee6599..0000000000 --- a/bigframes/core/traversal.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import bigframes.core.nodes as nodes - - -def is_trivially_executable(node: nodes.BigFrameNode) -> bool: - if local_only(node): - return True - children_trivial = all(is_trivially_executable(child) for child in node.child_nodes) - self_trivial = (not node.non_local) and (node.row_preserving) - return children_trivial and self_trivial - - -def local_only(node: nodes.BigFrameNode) -> bool: - return all(isinstance(node, nodes.ReadLocalNode) for node in node.roots) diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py new file mode 100644 index 0000000000..125a7e6bff --- /dev/null +++ b/bigframes/core/tree_properties.py @@ -0,0 +1,85 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import functools +import itertools +from typing import Dict + +import bigframes.core.nodes as nodes + + +def is_trivially_executable(node: nodes.BigFrameNode) -> bool: + if local_only(node): + return True + children_trivial = all(is_trivially_executable(child) for child in node.child_nodes) + self_trivial = (not node.non_local) and (node.row_preserving) + return children_trivial and self_trivial + + +def local_only(node: nodes.BigFrameNode) -> bool: + return all(isinstance(node, nodes.ReadLocalNode) for node in node.roots) + + +def peekable(node: nodes.BigFrameNode) -> bool: + if local_only(node): + return True + children_peekable = all(peekable(child) for child in node.child_nodes) + self_peekable = not node.non_local + return children_peekable and self_peekable + + +def count_complex_nodes( + root: nodes.BigFrameNode, min_complexity: float, max_complexity: float +) -> Dict[nodes.BigFrameNode, int]: + @functools.cache + def _node_counts_inner( + subtree: nodes.BigFrameNode, + ) -> Dict[nodes.BigFrameNode, int]: + """Helper function to count occurences of duplicate nodes in a subtree. 
Considers only nodes in a complexity range""" + empty_counts: Dict[nodes.BigFrameNode, int] = {} + if subtree.planning_complexity >= min_complexity: + child_counts = [_node_counts_inner(child) for child in subtree.child_nodes] + node_counts = functools.reduce(_combine_counts, child_counts, empty_counts) + if subtree.planning_complexity <= max_complexity: + return _combine_counts(node_counts, {subtree: 1}) + else: + return node_counts + return empty_counts + + return _node_counts_inner(root) + + +def replace_nodes( + root: nodes.BigFrameNode, + to_replace: nodes.BigFrameNode, + replacemenet: nodes.BigFrameNode, +): + @functools.cache + def apply_substition(n: nodes.BigFrameNode) -> nodes.BigFrameNode: + if n == to_replace: + return replacemenet + else: + return n.transform_children(apply_substition) + + return root.transform_children(apply_substition) + + +def _combine_counts( + left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int] +) -> Dict[nodes.BigFrameNode, int]: + return { + key: left.get(key, 0) + right.get(key, 0) + for key in itertools.chain(left.keys(), right.keys()) + } diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 07dae2c53b..460d1056a3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -50,6 +50,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.convert import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid @@ -306,6 +307,11 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def bqclient(self) -> bigframes.Session: + """BigQuery REST API Client the DataFrame uses for operations.""" + return self._session.bqclient + @property def _session(self) -> bigframes.Session: return self._get_block().expr.session @@ -658,22 +664,20 @@ def _apply_binop( how: str = "outer", reverse: bool = False, ): - if isinstance(other, (float, int)): + if isinstance(other, (float, int, bool)): return self._apply_scalar_binop(other, op, reverse=reverse) - elif isinstance(other, indexes.Index): - return self._apply_series_binop( - other.to_series(index=self.index), - op, - axis=axis, - how=how, - reverse=reverse, - ) - elif isinstance(other, bigframes.series.Series): - return self._apply_series_binop( - other, op, axis=axis, how=how, reverse=reverse - ) elif isinstance(other, DataFrame): return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) + elif isinstance(other, pandas.DataFrame): + return self._apply_dataframe_binop( + DataFrame(other), op, how=how, reverse=reverse + ) + elif utils.get_axis_number(axis) == 0: + bf_series = bigframes.core.convert.to_bf_series(other, self.index) + return self._apply_series_binop_axis_0(bf_series, op, how, reverse) + elif utils.get_axis_number(axis) == 1: + pd_series = bigframes.core.convert.to_pd_series(other, self.columns) + return self._apply_series_binop_axis_1(pd_series, op, how, reverse) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." 
f"{constants.FEEDBACK_LINK}" @@ -695,22 +699,13 @@ def _apply_scalar_binop( block = block.drop_columns([column_id]) return DataFrame(block) - def _apply_series_binop( + def _apply_series_binop_axis_0( self, other: bigframes.series.Series, op: ops.BinaryOp, - axis: str | int = "columns", how: str = "outer", reverse: bool = False, ) -> DataFrame: - if axis not in ("columns", "index", 0, 1): - raise ValueError(f"Invalid input: axis {axis}.") - - if axis in ("columns", 1): - raise NotImplementedError( - f"Row Series operations haven't been supported. {constants.FEEDBACK_LINK}" - ) - block, (get_column_left, get_column_right) = self._block.join( other._block, how=how ) @@ -733,6 +728,63 @@ def _apply_series_binop( block = block.with_index_labels(self.index.names) return DataFrame(block) + def _apply_series_binop_axis_1( + self, + other: pandas.Series, + op: ops.BinaryOp, + how: str = "outer", + reverse: bool = False, + ) -> DataFrame: + # Somewhat different alignment than df-df so separate codepath for now. + if self.columns.equals(other.index): + columns, lcol_indexer, rcol_indexer = self.columns, None, None + else: + if not (self.columns.is_unique and other.index.is_unique): + raise ValueError("Cannot align non-unique indices") + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.index, how=how, return_indexers=True + ) + + binop_result_ids = [] + + column_indices = zip( + lcol_indexer if (lcol_indexer is not None) else range(len(columns)), + rcol_indexer if (rcol_indexer is not None) else range(len(columns)), + ) + + block = self._block + for left_index, right_index in column_indices: + if left_index >= 0 and right_index >= 0: # -1 indices indicate missing + self_col_id = self._block.value_columns[left_index] + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(other_scalar)) + ) + elif left_index >= 0: + self_col_id = self._block.value_columns[left_index] + expr = ( + op.as_expr(ex.const(None), self_col_id) + if reverse + else op.as_expr(self_col_id, ex.const(None)) + ) + elif right_index >= 0: + other_scalar = other.iloc[right_index] + expr = ( + op.as_expr(ex.const(other_scalar), ex.const(None)) + if reverse + else op.as_expr(ex.const(None), ex.const(other_scalar)) + ) + else: + # Should not be possible + raise ValueError("No right or left index.") + block, result_col_id = block.project_expr(expr) + binop_result_ids.append(result_col_id) + + block = block.select_columns(binop_result_ids) + return DataFrame(block.with_column_labels(columns)) + def _apply_dataframe_binop( self, other: DataFrame, @@ -1071,7 +1123,7 @@ def to_pandas( downsampled rows and all columns of this DataFrame. """ # TODO(orrbradford): Optimize this in future. 
Potentially some cases where we can return the stored query job - + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -1083,6 +1135,7 @@ def to_pandas( def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame""" + self._optimize_query_complexity() return self._block.to_pandas_batches() def _compute_dry_run(self) -> bigquery.QueryJob: @@ -1120,7 +1173,7 @@ def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: if maybe_result is None: if force: self._cached() - maybe_result = self._block.try_peek(n) + maybe_result = self._block.try_peek(n, force=True) assert maybe_result is not None else: raise ValueError( @@ -1493,6 +1546,17 @@ def sort_values( ) return DataFrame(self._block.order_by(ordering)) + def eval(self, expr: str) -> DataFrame: + import bigframes.core.eval as bf_eval + + return bf_eval.eval(self, expr, target=self) + + def query(self, expr: str) -> DataFrame: + import bigframes.core.eval as bf_eval + + eval_result = bf_eval.eval(self, expr, target=None) + return self[eval_result] + def value_counts( self, subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None, @@ -2516,6 +2580,36 @@ def sample( )[0] ) + def explode( + self, + column: typing.Union[blocks.Label, typing.Sequence[blocks.Label]], + *, + ignore_index: Optional[bool] = False, + ) -> DataFrame: + if not utils.is_list_like(column): + column_labels = typing.cast(typing.Sequence[blocks.Label], (column,)) + else: + column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column)) + + if not column_labels: + raise ValueError("column must be nonempty") + if len(column_labels) > len(set(column_labels)): + raise ValueError("column must be unique") + + column_ids = [self._resolve_label_exact(label) for label in column_labels] + missing = [ + column_labels[i] for i in range(len(column_ids)) if column_ids[i] is None + ] + if len(missing) > 0: + raise KeyError(f"None of {missing} are in the columns") + + return DataFrame( + self._block.explode( + column_ids=typing.cast(typing.Sequence[str], tuple(column_ids)), + ignore_index=ignore_index, + ) + ) + def _split( self, ns: Iterable[int] = (), @@ -2986,6 +3080,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" session = self._block.expr.session + self._optimize_query_complexity() export_array, id_overrides = self._prepare_export( index=index, ordering_id=ordering_id ) @@ -3122,6 +3217,14 @@ def _cached(self, *, force: bool = False) -> DataFrame: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. + """ + # TODO: Move all this to session + new_expr = self._session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + _DataFrameOrSeries = typing.TypeVar("_DataFrameOrSeries") def dot(self, other: _DataFrameOrSeries) -> _DataFrameOrSeries: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 63adc059f3..c5bf5db2fe 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -47,13 +47,19 @@ # None represents the type of a None scalar. 
ExpressionType = typing.Optional[Dtype] -# Used when storing Null expressions -DEFAULT_DTYPE = pd.Float64Dtype() INT_DTYPE = pd.Int64Dtype() FLOAT_DTYPE = pd.Float64Dtype() BOOL_DTYPE = pd.BooleanDtype() STRING_DTYPE = pd.StringDtype(storage="pyarrow") +BYTES_DTYPE = pd.ArrowDtype(pa.binary()) +DATE_DTYPE = pd.ArrowDtype(pa.date32()) +TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) +DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) +TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) + +# Used when storing Null expressions +DEFAULT_DTYPE = FLOAT_DTYPE # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -100,6 +106,46 @@ pd.ArrowDtype(pa.decimal256(76, 38)), ] + +## dtype predicates - use these to maintain consistency +def is_datetime_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE) + + +def is_date_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, DATE_DTYPE) + + +def is_time_like(type: ExpressionType) -> bool: + return type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE) + + +def is_binary_like(type: ExpressionType) -> bool: + return type in (BOOL_DTYPE, BYTES_DTYPE, INT_DTYPE) + + +def is_string_like(type: ExpressionType) -> bool: + return type in (STRING_DTYPE, BYTES_DTYPE) + + +def is_array_like(type: ExpressionType) -> bool: + return isinstance(type, pd.ArrowDtype) and isinstance( + type.pyarrow_dtype, pa.ListType + ) + + +def is_numeric(type: ExpressionType) -> bool: + return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + + +def is_iterable(type: ExpressionType) -> bool: + return type in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type) + + +def is_comparable(type: ExpressionType) -> bool: + return (type is not None) and (type not in UNORDERED_DTYPES) + + # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame ReadOnlyIbisDtype = Union[ ibis_dtypes.Binary, @@ -305,6 +351,10 @@ def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: ) +def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: + return ibis_dtype_to_bigframes_dtype(arrow_dtype_to_ibis_dtype(arrow_dtype)) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: @@ -605,6 +655,7 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: + """Get the supertype of the two types.""" if dtype1 == dtype2: return dtype1 # Implicit conversion currently only supported for numeric types @@ -621,12 +672,26 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: return hierarchy[lcd_index] -def lcd_etype(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: - if etype1 is None: +def coerce_to_common(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: + """Coerce types to a common type or throw a TypeError""" + if etype1 is not None and etype2 is not None: + common_supertype = lcd_type(etype1, etype2) + if common_supertype is not None: + return common_supertype + if can_coerce(etype1, etype2): return etype2 - if etype2 is None: + if can_coerce(etype2, etype1): return etype1 - return lcd_type_or_throw(etype1, etype2) + raise TypeError(f"Cannot coerce {etype1} and {etype2} to a common type.") + + +def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool: + if source_type is None: + return True # None can be coerced to any supported 
type + else: + return (source_type == STRING_DTYPE) and ( + target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE) + ) def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index e58ed4feef..c57cb78791 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -77,7 +77,7 @@ def fit_transform(self, x_train: Union[DataFrame, Series], y_train: Union[DataFr """ def __repr__(self): - """Print the estimator's constructor with all non-default parameter values""" + """Print the estimator's constructor with all non-default parameter values.""" # Estimator pretty printer adapted from Sklearn's, which is in turn an adaption of # the inbuilt pretty-printer in CPython @@ -106,13 +106,13 @@ def predict(self, X): def register(self: _T, vertex_ai_model_id: Optional[str] = None) -> _T: """Register the model to Vertex AI. - After register, go to Google Cloud Console (https://console.cloud.google.com/vertex-ai/models) + After register, go to the Google Cloud console (https://console.cloud.google.com/vertex-ai/models) to manage the model registries. Refer to https://cloud.google.com/vertex-ai/docs/model-registry/introduction for more options. Args: vertex_ai_model_id (Optional[str], default None): - optional string id as model id in Vertex. If not set, will by default to 'bigframes_{bq_model_id}'. + Optional string id as model id in Vertex. If not set, will default to 'bigframes_{bq_model_id}'. Vertex Ai model id will be truncated to 63 characters due to its limitation. Returns: @@ -178,7 +178,33 @@ def fit( return self._fit(X, y) -class Transformer(BaseEstimator): +class BaseTransformer(BaseEstimator): + """Transformer base class.""" + + def __init__(self): + self._bqml_model: Optional[core.BqmlModel] = None + + _T = TypeVar("_T", bound="BaseTransformer") + + def to_gbq(self: _T, model_name: str, replace: bool = False) -> _T: + """Save the transformer as a BigQuery model. + + Args: + model_name (str): + The name of the model. + replace (bool, default False): + Whether to replace if the model already exists. Default to False. + + Returns: + Saved transformer.""" + if not self._bqml_model: + raise RuntimeError("A transformer must be fitted before it can be saved") + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + +class Transformer(BaseTransformer): """A BigQuery DataFrames Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" @@ -199,7 +225,7 @@ def fit_transform( return self.fit(X, y).transform(X) -class LabelTransformer(BaseEstimator): +class LabelTransformer(BaseTransformer): """A BigQuery DataFrames Label Transformer base class that transforms data. Also the transformers can be attached to a pipeline with a predictor.""" diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index d35941b338..89969f23e7 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -13,29 +13,34 @@ # limitations under the License. """Build composite transformers on heterogeneous data. 
This module is styled -after Scikit-Learn's compose module: +after scikit-Learn's compose module: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.compose.""" from __future__ import annotations +import re +import types import typing -from typing import List, Optional, Tuple, Union +from typing import cast, List, Optional, Tuple, Union import bigframes_vendored.sklearn.compose._column_transformer +from google.cloud import bigquery from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd -CompilablePreprocessorType = Union[ - preprocessing.OneHotEncoder, - preprocessing.StandardScaler, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, -] +_BQML_TRANSFROM_TYPE_MAPPING = types.MappingProxyType( + { + "ML.STANDARD_SCALER": preprocessing.StandardScaler, + "ML.ONE_HOT_ENCODER": preprocessing.OneHotEncoder, + "ML.MAX_ABS_SCALER": preprocessing.MaxAbsScaler, + "ML.MIN_MAX_SCALER": preprocessing.MinMaxScaler, + "ML.BUCKETIZE": preprocessing.KBinsDiscretizer, + "ML.LABEL_ENCODER": preprocessing.LabelEncoder, + } +) @log_adapter.class_logger @@ -52,7 +57,7 @@ def __init__( transformers: List[ Tuple[ str, - CompilablePreprocessorType, + preprocessing.PreprocessingType, Union[str, List[str]], ] ], @@ -67,17 +72,16 @@ def __init__( @property def transformers_( self, - ) -> List[Tuple[str, CompilablePreprocessorType, str,]]: + ) -> List[Tuple[str, preprocessing.PreprocessingType, str,]]: """The collection of transformers as tuples of (name, transformer, column).""" result: List[ Tuple[ str, - CompilablePreprocessorType, + preprocessing.PreprocessingType, str, ] ] = [] - column_set: set[str] = set() for entry in self.transformers: name, transformer, column_or_columns = entry columns = ( @@ -87,14 +91,90 @@ def transformers_( ) for column in columns: - if column in column_set: - raise NotImplementedError( - f"Chained transformers on the same column isn't supported. {constants.FEEDBACK_LINK}" - ) result.append((name, transformer, column)) return result + @classmethod + def _extract_from_bq_model( + cls, + bq_model: bigquery.Model, + ) -> ColumnTransformer: + """Extract transformers as ColumnTransformer obj from a BQ Model. Keep the _bqml_model field as None.""" + assert "transformColumns" in bq_model._properties + + transformers: List[ + Tuple[ + str, + preprocessing.PreprocessingType, + Union[str, List[str]], + ] + ] = [] + + def camel_to_snake(name): + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + output_names = [] + for transform_col in bq_model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) + # pass the columns that are not transformed + if "transformSql" not in transform_col_dict: + continue + transform_sql: str = transform_col_dict["transformSql"] + if not transform_sql.startswith("ML."): + continue + + output_names.append(transform_col_dict["name"]) + found_transformer = False + for prefix in _BQML_TRANSFROM_TYPE_MAPPING: + if transform_sql.startswith(prefix): + transformer_cls = _BQML_TRANSFROM_TYPE_MAPPING[prefix] + transformers.append( + ( + camel_to_snake(transformer_cls.__name__), + *transformer_cls._parse_from_sql(transform_sql), # type: ignore + ) + ) + + found_transformer = True + break + if not found_transformer: + raise NotImplementedError( + f"Unsupported transformer type. 
{constants.FEEDBACK_LINK}" + ) + + transformer = cls(transformers=transformers) + transformer._output_names = output_names + + return transformer + + def _merge( + self, bq_model: bigquery.Model + ) -> Union[ColumnTransformer, preprocessing.PreprocessingType,]: + """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer.""" + transformers = self.transformers_ + + assert len(transformers) > 0 + _, transformer_0, column_0 = transformers[0] + columns = [column_0] + for _, transformer, column in transformers[1:]: + # all transformers are the same + if transformer != transformer_0: + return self + columns.append(column) + # all feature columns are transformed + if sorted( + [ + cast(str, feature_column.name) + for feature_column in bq_model.feature_columns + ] + ) == sorted(columns): + transformer_0._output_names = self._output_names + return transformer_0 + + return self + def _compile_to_sql( self, columns: List[str], diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 03d9b806b9..04aaeec1bc 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -152,14 +152,14 @@ def generate_text( ), ) - def generate_text_embedding( + def generate_embedding( self, input_data: bpd.DataFrame, options: Mapping[str, int | float], ) -> bpd.DataFrame: return self._apply_sql( input_data, - lambda source_df: self._model_manipulation_sql_generator.ml_generate_text_embedding( + lambda source_df: self._model_manipulation_sql_generator.ml_generate_embedding( source_df=source_df, struct_options=options, ), diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 72ea600c58..a8f0329145 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Ensemble models. This module is styled after Scikit-Learn's ensemble module: +"""Ensemble models. This module is styled after scikit-learn's ensemble module: https://scikit-learn.org/stable/modules/ensemble.html""" from __future__ import annotations @@ -190,9 +190,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: saved model.""" if not self._bqml_model: @@ -343,9 +343,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: XGBClassifier: saved model.""" @@ -506,9 +506,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. Default to False. + Whether to replace if the model already exists. Default to False. Returns: RandomForestRegressor: saved model.""" @@ -669,9 +669,9 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestClassifi Args: model_name (str): - the name of the model. + The name of the model. replace (bool, default False): - whether to replace if the model already exists. 
Default to False. + Whether to replace if the model already exists. Default to False. Returns: RandomForestClassifier: saved model.""" diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 292389dcbb..e50a8ed35b 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -234,7 +234,12 @@ def _fit( def predict( self, X=None, *, horizon: int = 3, confidence_level: float = 0.95 ) -> bpd.DataFrame: - """Predict the closest cluster for each sample in X. + """Forecast time series at future horizon. + + .. note:: + + Output matches that of the BigQuery ML.FORECAST function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-forecast Args: X (default None): @@ -243,12 +248,12 @@ def predict( an int value that specifies the number of time points to forecast. The default value is 3, and the maximum value is 1000. confidence_level (float, default 0.95): - a float value that specifies percentage of the future values that fall in the prediction interval. + A float value that specifies percentage of the future values that fall in the prediction interval. The valid input range is [0.0, 1.0). Returns: bigframes.dataframe.DataFrame: The predicted DataFrames. Which - contains 2 columns "forecast_timestamp" and "forecast_value". + contains 2 columns: "forecast_timestamp" and "forecast_value". """ if horizon < 1 or horizon > 1000: raise ValueError(f"horizon must be [1, 1000], but is {horizon}.") @@ -279,7 +284,7 @@ def detect_anomalies( Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95. Returns: - bigframes.dataframe.DataFrame: detected DataFrame.""" + bigframes.dataframe.DataFrame: Detected DataFrame.""" if anomaly_prob_threshold < 0.0 or anomaly_prob_threshold >= 1.0: raise ValueError( f"anomaly_prob_threshold must be [0.0, 1.0), but is {anomaly_prob_threshold}." diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 7f75827083..b551150050 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -34,7 +34,7 @@ class TensorFlowModel(base.Predictor): model_path (str): GCS path that holds the model files. session (BigQuery Session): - BQ session to create the model + BQ session to create the model. """ def __init__( @@ -113,7 +113,7 @@ class ONNXModel(base.Predictor): model_path (str): Cloud Storage path that holds the model files. session (BigQuery Session): - BQ session to create the model + BQ session to create the model. """ def __init__( @@ -207,7 +207,7 @@ class XGBoostModel(base.Predictor): and feature_types are both specified in the model file. Supported types are "bool", "string", "int64", "float64", "array", "array", "array", "array". session (BigQuery Session): - BQ session to create the model + BQ session to create the model. 
""" def __init__( diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 6c4ae2ea43..ffaeb399bb 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -389,7 +389,14 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: "flatten_json_output": True, } - df = self._bqml_model.generate_text_embedding(X, options) + df = self._bqml_model.generate_embedding(X, options) + df = df.rename( + columns={ + "ml_generate_embedding_result": "text_embedding", + "ml_generate_embedding_statistics": "statistics", + "ml_generate_embedding_status": _ML_EMBED_TEXT_STATUS, + } + ) if (df[_ML_EMBED_TEXT_STATUS] != "").any(): warnings.warn( @@ -421,6 +428,12 @@ def to_gbq( class GeminiTextGenerator(base.BaseEstimator): """Gemini text generator LLM model. + .. note:: + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 31912a0129..c6e38e6534 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -23,6 +23,8 @@ import bigframes.constants as constants from bigframes.ml import ( cluster, + compose, + core, decomposition, ensemble, forecasting, @@ -30,6 +32,7 @@ linear_model, llm, pipeline, + preprocessing, utils, ) @@ -79,6 +82,8 @@ def from_bq( llm.PaLM2TextGenerator, llm.PaLM2TextEmbeddingGenerator, pipeline.Pipeline, + compose.ColumnTransformer, + preprocessing.PreprocessingType, ]: """Load a BQML model to BigQuery DataFrames ML. @@ -89,22 +94,36 @@ def from_bq( Returns: A BigQuery DataFrames ML model object. """ + # TODO(garrettwu): the entire condition only to TRANSFORM_ONLY when b/331679273 is fixed. 
+ if ( + bq_model.model_type == "TRANSFORM_ONLY" + or bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + and "transformColumns" in bq_model._properties + and not _is_bq_model_remote(bq_model) + ): + return _transformer_from_bq(session, bq_model) + if _is_bq_model_pipeline(bq_model): return pipeline.Pipeline._from_bq(session, bq_model) return _model_from_bq(session, bq_model) +def _transformer_from_bq(session: bigframes.Session, bq_model: bigquery.Model): + transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model)._merge( + bq_model + ) + transformer._bqml_model = core.BqmlModel(session, bq_model) + + return transformer + + def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): if bq_model.model_type in _BQML_MODEL_TYPE_MAPPING: return _BQML_MODEL_TYPE_MAPPING[bq_model.model_type]._from_bq( # type: ignore session=session, model=bq_model ) - if ( - bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" - and "remoteModelInfo" in bq_model._properties - and "endpoint" in bq_model._properties["remoteModelInfo"] - ): + if _is_bq_model_remote(bq_model): # Parse the remote model endpoint bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] model_endpoint = bqml_endpoint.split("/")[-1] @@ -121,3 +140,11 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): def _is_bq_model_pipeline(bq_model: bigquery.Model) -> bool: return "transformColumns" in bq_model._properties + + +def _is_bq_model_remote(bq_model: bigquery.Model) -> bool: + return ( + bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + and "remoteModelInfo" in bq_model._properties + and "endpoint" in bq_model._properties["remoteModelInfo"] + ) diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py index 6b0a243426..e79b46877b 100644 --- a/bigframes/ml/metrics/__init__.py +++ b/bigframes/ml/metrics/__init__.py @@ -18,6 +18,7 @@ auc, confusion_matrix, f1_score, + mean_squared_error, precision_score, r2_score, recall_score, @@ -35,5 +36,6 @@ "confusion_matrix", "precision_score", "f1_score", + "mean_squared_error", "pairwise", ] diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index e8c7400f35..2525ecd34f 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -13,14 +13,14 @@ # limitations under the License. """Metrics functions for evaluating models. 
This module is styled after -Scikit-Learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html.""" +scikit-learn's metrics module: https://scikit-learn.org/stable/modules/metrics.html.""" import inspect import typing from typing import Tuple, Union -import bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification -import bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking +import bigframes_vendored.sklearn.metrics._classification as vendored_metrics_classification +import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd @@ -79,7 +79,7 @@ def accuracy_score( return score.sum() -accuracy_score.__doc__ = inspect.getdoc(vendored_mertics_classification.accuracy_score) +accuracy_score.__doc__ = inspect.getdoc(vendored_metrics_classification.accuracy_score) def roc_curve( @@ -149,7 +149,7 @@ def roc_curve( ) -roc_curve.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_curve) +roc_curve.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_curve) def roc_auc_score( @@ -161,17 +161,13 @@ def roc_auc_score( fpr, tpr, _ = roc_curve(y_true_series, y_score_series, drop_intermediate=False) - # TODO(bmil): remove this once bigframes supports the necessary operations - pd_fpr = fpr.to_pandas() - pd_tpr = tpr.to_pandas() - # Use the trapezoid rule to compute the area under the ROC curve - width_diff = pd_fpr.diff().iloc[1:].reset_index(drop=True) - height_avg = (pd_tpr.iloc[:-1] + pd_tpr.iloc[1:].reset_index(drop=True)) / 2 - return (width_diff * height_avg).sum() + width_diff = fpr.diff().iloc[1:].reset_index(drop=True) + height_avg = (tpr.iloc[:-1] + tpr.iloc[1:].reset_index(drop=True)) / 2 + return typing.cast(float, (width_diff * height_avg).sum()) -roc_auc_score.__doc__ = inspect.getdoc(vendored_mertics_ranking.roc_auc_score) +roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score) def auc( @@ -185,7 +181,7 @@ def auc( return auc -auc.__doc__ = inspect.getdoc(vendored_mertics_ranking.auc) +auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) def confusion_matrix( @@ -223,7 +219,7 @@ def confusion_matrix( confusion_matrix.__doc__ = inspect.getdoc( - vendored_mertics_classification.confusion_matrix + vendored_metrics_classification.confusion_matrix ) @@ -261,7 +257,7 @@ def recall_score( return recall_score -recall_score.__doc__ = inspect.getdoc(vendored_mertics_classification.recall_score) +recall_score.__doc__ = inspect.getdoc(vendored_metrics_classification.recall_score) def precision_score( @@ -299,7 +295,7 @@ def precision_score( precision_score.__doc__ = inspect.getdoc( - vendored_mertics_classification.precision_score + vendored_metrics_classification.precision_score ) @@ -334,4 +330,18 @@ def f1_score( return f1_score -f1_score.__doc__ = inspect.getdoc(vendored_mertics_classification.f1_score) +f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score) + + +def mean_squared_error( + y_true: Union[bpd.DataFrame, bpd.Series], + y_pred: Union[bpd.DataFrame, bpd.Series], +) -> float: + y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred) + + return (y_pred_series - y_true_series).pow(2).sum() / len(y_true_series) + + +mean_squared_error.__doc__ = inspect.getdoc( + vendored_metrics_regression.mean_squared_error +) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 443b9e7be6..42c13fdb40 
100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -13,7 +13,7 @@ # limitations under the License. """Functions for test/train split and model tuning. This module is styled after -Scikit-Learn's model_selection module: +scikit-learn's model_selection module: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection.""" @@ -51,7 +51,7 @@ def train_test_split( List[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]]: A list of BigQuery DataFrames or Series. """ - # TODO(garrettwu): Scikit-Learn throws an error when the dataframes don't have the same + # TODO(garrettwu): scikit-learn throws an error when the dataframes don't have the same # number of rows. We probably want to do something similar. Now the implementation is based # on index. We'll move to based on ordering first. diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 9289b613b8..5df2378575 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""For composing estimators together. This module is styled after Scikit-Learn's +"""For composing estimators together. This module is styled after scikit-learn's pipeline module: https://scikit-learn.org/stable/modules/pipeline.html.""" from __future__ import annotations -from typing import cast, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import bigframes_vendored.sklearn.pipeline from google.cloud import bigquery @@ -83,8 +83,8 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): @classmethod def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> Pipeline: - col_transformer = _extract_as_column_transformer(bq_model) - transform = _merge_column_transformer(bq_model, col_transformer) + col_transformer = compose.ColumnTransformer._extract_from_bq_model(bq_model) + transform = col_transformer._merge(bq_model) estimator = loader._model_from_bq(session, bq_model) return cls([("transform", transform), ("estimator", estimator)]) @@ -138,110 +138,3 @@ def to_gbq(self, model_name: str, replace: bool = False) -> Pipeline: new_model = self._estimator._bqml_model.copy(model_name, replace) return new_model.session.read_gbq_model(model_name) - - -def _extract_as_column_transformer( - bq_model: bigquery.Model, -) -> compose.ColumnTransformer: - """Extract transformers as ColumnTransformer obj from a BQ Model.""" - assert "transformColumns" in bq_model._properties - - transformers: List[ - Tuple[ - str, - Union[ - preprocessing.OneHotEncoder, - preprocessing.StandardScaler, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, - ], - Union[str, List[str]], - ] - ] = [] - for transform_col in bq_model._properties["transformColumns"]: - # pass the columns that are not transformed - if "transformSql" not in transform_col: - continue - - transform_sql: str = cast(dict, transform_col)["transformSql"] - if transform_sql.startswith("ML.STANDARD_SCALER"): - transformers.append( - ( - "standard_scaler", - *preprocessing.StandardScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.ONE_HOT_ENCODER"): - transformers.append( - ( - "ont_hot_encoder", - *preprocessing.OneHotEncoder._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.MAX_ABS_SCALER"): - transformers.append( - ( - "max_abs_scaler", - 
*preprocessing.MaxAbsScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.MIN_MAX_SCALER"): - transformers.append( - ( - "min_max_scaler", - *preprocessing.MinMaxScaler._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.BUCKETIZE"): - transformers.append( - ( - "k_bins_discretizer", - *preprocessing.KBinsDiscretizer._parse_from_sql(transform_sql), - ) - ) - elif transform_sql.startswith("ML.LABEL_ENCODER"): - transformers.append( - ( - "label_encoder", - *preprocessing.LabelEncoder._parse_from_sql(transform_sql), - ) - ) - else: - raise NotImplementedError( - f"Unsupported transformer type. {constants.FEEDBACK_LINK}" - ) - - return compose.ColumnTransformer(transformers=transformers) - - -def _merge_column_transformer( - bq_model: bigquery.Model, column_transformer: compose.ColumnTransformer -) -> Union[ - compose.ColumnTransformer, - preprocessing.StandardScaler, - preprocessing.OneHotEncoder, - preprocessing.MaxAbsScaler, - preprocessing.MinMaxScaler, - preprocessing.KBinsDiscretizer, - preprocessing.LabelEncoder, -]: - """Try to merge the column transformer to a simple transformer.""" - transformers = column_transformer.transformers_ - - assert len(transformers) > 0 - _, transformer_0, column_0 = transformers[0] - columns = [column_0] - for _, transformer, column in transformers[1:]: - # all transformers are the same - if transformer != transformer_0: - return column_transformer - columns.append(column) - # all feature columns are transformed - if sorted( - [cast(str, feature_column.name) for feature_column in bq_model.feature_columns] - ) == sorted(columns): - return transformer_0 - - return column_transformer diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 23eab42978..673ee27db0 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -13,7 +13,7 @@ # limitations under the License. """Transformers that prepare data for other estimators. 
This module is styled after -Scikit-Learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html.""" +scikit-learn's preprocessing module: https://scikit-learn.org/stable/modules/preprocessing.html.""" from __future__ import annotations @@ -639,3 +639,13 @@ def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) + + +PreprocessingType = Union[ + OneHotEncoder, + StandardScaler, + MaxAbsScaler, + MinMaxScaler, + KBinsDiscretizer, + LabelEncoder, +] diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 807fadc06a..fab358cce3 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -270,12 +270,12 @@ def ml_generate_text( return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{self._model_name}`, ({self._source_sql(source_df)}), {struct_options_sql})""" - def ml_generate_text_embedding( + def ml_generate_embedding( self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] ) -> str: - """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" + """Encode ML.GENERATE_EMBEDDING for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `{self._model_name}`, ({self._source_sql(source_df)}), {struct_options_sql})""" def ml_detect_anomalies( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 4ecb8dca5a..d631ba8508 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -161,97 +161,165 @@ def _convert_expr_input( # Operation Factories -def create_unary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.INPUT_TYPE -) -> UnaryOp: +def create_unary_op(name: str, type_signature: op_typing.UnaryTypeSignature) -> UnaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore bases=(UnaryOp,), frozen=True, )() def create_binary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() + name: str, type_signature: op_typing.BinaryTypeSignature ) -> BinaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_signature.as_method)], # type: ignore bases=(BinaryOp,), frozen=True, )() -def create_ternary_op( - name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() -) -> TernaryOp: - return dataclasses.make_dataclass( - name, - [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore - bases=(TernaryOp,), - frozen=True, - )() - - # Unary Ops ## Generic Ops -invert_op = create_unary_op(name="invert", type_rule=op_typing.INPUT_TYPE) -isnull_op = create_unary_op(name="isnull", type_rule=op_typing.PREDICATE) -notnull_op = create_unary_op(name="notnull", type_rule=op_typing.PREDICATE) -hash_op = create_unary_op(name="hash", type_rule=op_typing.INTEGER) +invert_op = create_unary_op( + name="invert", + type_signature=op_typing.TypePreserving( + dtypes.is_binary_like, + description="binary-like", + ), +) # numeric +isnull_op = 
create_unary_op( + name="isnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +notnull_op = create_unary_op( + name="notnull", + type_signature=op_typing.FixedOutputType( + lambda x: True, dtypes.BOOL_DTYPE, description="nullable" + ), +) +hash_op = create_unary_op( + name="hash", + type_signature=op_typing.FixedOutputType( + dtypes.is_string_like, dtypes.INT_DTYPE, description="string-like" + ), +) ## String Ops -len_op = create_unary_op(name="len", type_rule=op_typing.INTEGER) -reverse_op = create_unary_op(name="reverse", type_rule=op_typing.STRING) -lower_op = create_unary_op(name="lower", type_rule=op_typing.STRING) -upper_op = create_unary_op(name="upper", type_rule=op_typing.STRING) -strip_op = create_unary_op(name="strip", type_rule=op_typing.STRING) -isalnum_op = create_unary_op(name="isalnum", type_rule=op_typing.PREDICATE) -isalpha_op = create_unary_op(name="isalpha", type_rule=op_typing.PREDICATE) -isdecimal_op = create_unary_op(name="isdecimal", type_rule=op_typing.PREDICATE) -isdigit_op = create_unary_op(name="isdigit", type_rule=op_typing.PREDICATE) -isnumeric_op = create_unary_op(name="isnumeric", type_rule=op_typing.PREDICATE) -isspace_op = create_unary_op(name="isspace", type_rule=op_typing.PREDICATE) -islower_op = create_unary_op(name="islower", type_rule=op_typing.PREDICATE) -isupper_op = create_unary_op(name="isupper", type_rule=op_typing.PREDICATE) -rstrip_op = create_unary_op(name="rstrip", type_rule=op_typing.STRING) -lstrip_op = create_unary_op(name="lstrip", type_rule=op_typing.STRING) -capitalize_op = create_unary_op(name="capitalize", type_rule=op_typing.STRING) +len_op = create_unary_op( + name="len", + type_signature=op_typing.FixedOutputType( + dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable" + ), +) +reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM) +lower_op = create_unary_op(name="lower", type_signature=op_typing.STRING_TRANSFORM) +upper_op = create_unary_op(name="upper", type_signature=op_typing.STRING_TRANSFORM) +strip_op = create_unary_op(name="strip", type_signature=op_typing.STRING_TRANSFORM) +isalnum_op = create_unary_op(name="isalnum", type_signature=op_typing.STRING_PREDICATE) +isalpha_op = create_unary_op(name="isalpha", type_signature=op_typing.STRING_PREDICATE) +isdecimal_op = create_unary_op( + name="isdecimal", type_signature=op_typing.STRING_PREDICATE +) +isdigit_op = create_unary_op(name="isdigit", type_signature=op_typing.STRING_PREDICATE) +isnumeric_op = create_unary_op( + name="isnumeric", type_signature=op_typing.STRING_PREDICATE +) +isspace_op = create_unary_op(name="isspace", type_signature=op_typing.STRING_PREDICATE) +islower_op = create_unary_op(name="islower", type_signature=op_typing.STRING_PREDICATE) +isupper_op = create_unary_op(name="isupper", type_signature=op_typing.STRING_PREDICATE) +rstrip_op = create_unary_op(name="rstrip", type_signature=op_typing.STRING_TRANSFORM) +lstrip_op = create_unary_op(name="lstrip", type_signature=op_typing.STRING_TRANSFORM) +capitalize_op = create_unary_op( + name="capitalize", type_signature=op_typing.STRING_TRANSFORM +) ## DateTime Ops -day_op = create_unary_op(name="day", type_rule=op_typing.INTEGER) -dayofweek_op = create_unary_op(name="dayofweek", type_rule=op_typing.INTEGER) +### datelike accessors +day_op = create_unary_op( + name="day", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +month_op = create_unary_op( + name="month", + type_signature=op_typing.DATELIKE_ACCESSOR, 
+) +year_op = create_unary_op( + name="year", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +dayofweek_op = create_unary_op( + name="dayofweek", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +quarter_op = create_unary_op( + name="quarter", + type_signature=op_typing.DATELIKE_ACCESSOR, +) +### timelike accessors +hour_op = create_unary_op( + name="hour", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +minute_op = create_unary_op( + name="minute", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +second_op = create_unary_op( + name="second", + type_signature=op_typing.TIMELIKE_ACCESSOR, +) +normalize_op = create_unary_op( + name="normalize", + type_signature=op_typing.TypePreserving( + dtypes.is_time_like, + description="time-like", + ), +) +### datetimelike accessors date_op = create_unary_op( - name="date", type_rule=op_typing.Fixed(pd.ArrowDtype(pa.date32())) + name="date", + type_signature=op_typing.FixedOutputType( + dtypes.is_date_like, dtypes.DATE_DTYPE, description="date-like" + ), ) -hour_op = create_unary_op(name="hour", type_rule=op_typing.INTEGER) -minute_op = create_unary_op(name="minute", type_rule=op_typing.INTEGER) -month_op = create_unary_op(name="month", type_rule=op_typing.INTEGER) -quarter_op = create_unary_op(name="quarter", type_rule=op_typing.INTEGER) -second_op = create_unary_op(name="second", type_rule=op_typing.INTEGER) time_op = create_unary_op( - name="time", type_rule=op_typing.Fixed(pd.ArrowDtype(pa.time64("us"))) + name="time", + type_signature=op_typing.FixedOutputType( + dtypes.is_time_like, dtypes.TIME_DTYPE, description="time-like" + ), ) -year_op = create_unary_op(name="year", type_rule=op_typing.INTEGER) -normalize_op = create_unary_op(name="normalize") ## Trigonometry Ops -sin_op = create_unary_op(name="sin", type_rule=op_typing.REAL_NUMERIC) -cos_op = create_unary_op(name="cos", type_rule=op_typing.REAL_NUMERIC) -tan_op = create_unary_op(name="tan", type_rule=op_typing.REAL_NUMERIC) -arcsin_op = create_unary_op(name="arcsin", type_rule=op_typing.REAL_NUMERIC) -arccos_op = create_unary_op(name="arccos", type_rule=op_typing.REAL_NUMERIC) -arctan_op = create_unary_op(name="arctan", type_rule=op_typing.REAL_NUMERIC) -sinh_op = create_unary_op(name="sinh", type_rule=op_typing.REAL_NUMERIC) -cosh_op = create_unary_op(name="cosh", type_rule=op_typing.REAL_NUMERIC) -tanh_op = create_unary_op(name="tanh", type_rule=op_typing.REAL_NUMERIC) -arcsinh_op = create_unary_op(name="arcsinh", type_rule=op_typing.REAL_NUMERIC) -arccosh_op = create_unary_op(name="arccosh", type_rule=op_typing.REAL_NUMERIC) -arctanh_op = create_unary_op(name="arctanh", type_rule=op_typing.REAL_NUMERIC) +sin_op = create_unary_op(name="sin", type_signature=op_typing.UNARY_REAL_NUMERIC) +cos_op = create_unary_op(name="cos", type_signature=op_typing.UNARY_REAL_NUMERIC) +tan_op = create_unary_op(name="tan", type_signature=op_typing.UNARY_REAL_NUMERIC) +arcsin_op = create_unary_op(name="arcsin", type_signature=op_typing.UNARY_REAL_NUMERIC) +arccos_op = create_unary_op(name="arccos", type_signature=op_typing.UNARY_REAL_NUMERIC) +arctan_op = create_unary_op(name="arctan", type_signature=op_typing.UNARY_REAL_NUMERIC) +sinh_op = create_unary_op(name="sinh", type_signature=op_typing.UNARY_REAL_NUMERIC) +cosh_op = create_unary_op(name="cosh", type_signature=op_typing.UNARY_REAL_NUMERIC) +tanh_op = create_unary_op(name="tanh", type_signature=op_typing.UNARY_REAL_NUMERIC) +arcsinh_op = create_unary_op( + name="arcsinh", type_signature=op_typing.UNARY_REAL_NUMERIC +) +arccosh_op = create_unary_op( 
+ name="arccosh", type_signature=op_typing.UNARY_REAL_NUMERIC +) +arctanh_op = create_unary_op( + name="arctanh", type_signature=op_typing.UNARY_REAL_NUMERIC +) ## Numeric Ops -abs_op = create_unary_op(name="abs", type_rule=op_typing.INPUT_TYPE) -exp_op = create_unary_op(name="exp", type_rule=op_typing.REAL_NUMERIC) -ln_op = create_unary_op(name="log", type_rule=op_typing.REAL_NUMERIC) -log10_op = create_unary_op(name="log10", type_rule=op_typing.REAL_NUMERIC) -sqrt_op = create_unary_op(name="sqrt", type_rule=op_typing.REAL_NUMERIC) +floor_op = create_unary_op(name="floor", type_signature=op_typing.UNARY_REAL_NUMERIC) +ceil_op = create_unary_op(name="ceil", type_signature=op_typing.UNARY_REAL_NUMERIC) +abs_op = create_unary_op(name="abs", type_signature=op_typing.UNARY_NUMERIC) +exp_op = create_unary_op(name="exp", type_signature=op_typing.UNARY_REAL_NUMERIC) +expm1_op = create_unary_op(name="expm1", type_signature=op_typing.UNARY_REAL_NUMERIC) +ln_op = create_unary_op(name="log", type_signature=op_typing.UNARY_REAL_NUMERIC) +log10_op = create_unary_op(name="log10", type_signature=op_typing.UNARY_REAL_NUMERIC) +log1p_op = create_unary_op(name="log1p", type_signature=op_typing.UNARY_REAL_NUMERIC) +sqrt_op = create_unary_op(name="sqrt", type_signature=op_typing.UNARY_REAL_NUMERIC) # Parameterized unary ops @@ -261,7 +329,7 @@ class StrContainsOp(UnaryOp): pat: str def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -270,7 +338,7 @@ class StrContainsRegexOp(UnaryOp): pat: str def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -279,7 +347,7 @@ class StrGetOp(UnaryOp): i: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -290,7 +358,7 @@ class StrPadOp(UnaryOp): side: typing.Literal["both", "left", "right"] def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -300,7 +368,7 @@ class ReplaceStrOp(UnaryOp): repl: str def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -310,7 +378,7 @@ class RegexReplaceStrOp(UnaryOp): repl: str def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -319,7 +387,7 @@ class StartsWithOp(UnaryOp): pat: typing.Sequence[str] def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -328,7 +396,7 @@ class EndsWithOp(UnaryOp): pat: typing.Sequence[str] def output_type(self, *input_types): - return dtypes.BOOL_DTYPE + return op_typing.STRING_PREDICATE.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -337,7 +405,7 @@ class ZfillOp(UnaryOp): width: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -348,7 +416,10 @@ class StrFindOp(UnaryOp): end: typing.Optional[int] def output_type(self, *input_types): - return dtypes.INT_DTYPE + signature = 
op_typing.FixedOutputType( + dtypes.is_string_like, dtypes.INT_DTYPE, "string-like" + ) + return signature.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -358,7 +429,7 @@ class StrExtractOp(UnaryOp): n: int = 1 def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -368,7 +439,7 @@ class StrSliceOp(UnaryOp): end: typing.Optional[int] def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) @dataclasses.dataclass(frozen=True) @@ -377,7 +448,7 @@ class StrRepeatOp(UnaryOp): repeats: int def output_type(self, *input_types): - return dtypes.STRING_DTYPE + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) # Other parameterized unary operations @@ -387,9 +458,15 @@ class StructFieldOp(UnaryOp): name_or_index: str | int def output_type(self, *input_types): - pd_type = typing.cast(pd.ArrowDtype, input_types[0]) - pa_struct_t = typing.cast(pa.StructType, pd_type.pyarrow_dtype) - pa_result_type = pa_struct_t[self.name_or_index].type + input_type = input_types[0] + if not isinstance(input_type, pd.ArrowDtype): + raise TypeError("field accessor input must be a struct type") + + pa_type = input_type.pyarrow_dtype + if not isinstance(pa_type, pa.StructType): + raise TypeError("field accessor input must be a struct type") + + pa_result_type = pa_type[self.name_or_index].type # TODO: Directly convert from arrow to pandas type ibis_result_type = dtypes.arrow_dtype_to_ibis_dtype(pa_result_type) return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type) @@ -471,37 +548,94 @@ def output_type(self, *input_types): # Binary Ops -fillna_op = create_binary_op(name="fillna") -cliplower_op = create_binary_op(name="clip_lower") -clipupper_op = create_binary_op(name="clip_upper") -coalesce_op = create_binary_op(name="coalesce") +fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) +cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE) +clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE) +coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE) + + ## Math Ops -add_op = create_binary_op(name="add", type_rule=op_typing.NUMERIC) -sub_op = create_binary_op(name="sub", type_rule=op_typing.NUMERIC) -mul_op = create_binary_op(name="mul", type_rule=op_typing.NUMERIC) -div_op = create_binary_op(name="div", type_rule=op_typing.REAL_NUMERIC) -floordiv_op = create_binary_op(name="floordiv", type_rule=op_typing.NUMERIC) -pow_op = create_binary_op(name="pow", type_rule=op_typing.NUMERIC) -mod_op = create_binary_op(name="mod", type_rule=op_typing.NUMERIC) -round_op = create_binary_op(name="round", type_rule=op_typing.REAL_NUMERIC) -unsafe_pow_op = create_binary_op(name="unsafe_pow_op", type_rule=op_typing.REAL_NUMERIC) +@dataclasses.dataclass(frozen=True) +class AddOp(BinaryOp): + name: typing.ClassVar[str] = "add" + + def output_type(self, *input_types): + left_type = input_types[0] + right_type = input_types[1] + if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: + # String addition + return input_types[0] + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + # Numeric addition + return dtypes.coerce_to_common(left_type, right_type) + # TODO: Add temporal addition once delta types supported + raise 
TypeError(f"Cannot add dtypes {left_type} and {right_type}") + + +@dataclasses.dataclass(frozen=True) +class SubOp(BinaryOp): + name: typing.ClassVar[str] = "sub" + + # Note: this is actualyl a vararg op, but we don't model that yet + def output_type(self, *input_types): + left_type = input_types[0] + right_type = input_types[1] + if (left_type is None or dtypes.is_numeric(left_type)) and ( + right_type is None or dtypes.is_numeric(right_type) + ): + # Numeric subtraction + return dtypes.coerce_to_common(left_type, right_type) + # TODO: Add temporal addition once delta types supported + raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") + + +add_op = AddOp() +sub_op = SubOp() +mul_op = create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC) +div_op = create_binary_op(name="div", type_signature=op_typing.BINARY_REAL_NUMERIC) +floordiv_op = create_binary_op(name="floordiv", type_signature=op_typing.BINARY_NUMERIC) +pow_op = create_binary_op(name="pow", type_signature=op_typing.BINARY_NUMERIC) +mod_op = create_binary_op(name="mod", type_signature=op_typing.BINARY_NUMERIC) +arctan2_op = create_binary_op( + name="arctan2", type_signature=op_typing.BINARY_REAL_NUMERIC +) +round_op = create_binary_op(name="round", type_signature=op_typing.BINARY_REAL_NUMERIC) +unsafe_pow_op = create_binary_op( + name="unsafe_pow_op", type_signature=op_typing.BINARY_REAL_NUMERIC +) # Logical Ops -and_op = create_binary_op(name="and") -or_op = create_binary_op(name="or") +and_op = create_binary_op(name="and", type_signature=op_typing.LOGICAL) +or_op = create_binary_op(name="or", type_signature=op_typing.LOGICAL) ## Comparison Ops -eq_op = create_binary_op(name="eq", type_rule=op_typing.PREDICATE) +eq_op = create_binary_op(name="eq", type_signature=op_typing.COMPARISON) eq_null_match_op = create_binary_op( - name="eq_nulls_match", type_rule=op_typing.PREDICATE + name="eq_nulls_match", type_signature=op_typing.COMPARISON ) -ne_op = create_binary_op(name="ne", type_rule=op_typing.PREDICATE) -lt_op = create_binary_op(name="lt", type_rule=op_typing.PREDICATE) -gt_op = create_binary_op(name="gt", type_rule=op_typing.PREDICATE) -le_op = create_binary_op(name="le", type_rule=op_typing.PREDICATE) -ge_op = create_binary_op(name="ge", type_rule=op_typing.PREDICATE) +ne_op = create_binary_op(name="ne", type_signature=op_typing.COMPARISON) +lt_op = create_binary_op(name="lt", type_signature=op_typing.COMPARISON) +gt_op = create_binary_op(name="gt", type_signature=op_typing.COMPARISON) +le_op = create_binary_op(name="le", type_signature=op_typing.COMPARISON) +ge_op = create_binary_op(name="ge", type_signature=op_typing.COMPARISON) + ## String Ops -strconcat_op = create_binary_op(name="strconcat", type_rule=op_typing.STRING) +@dataclasses.dataclass(frozen=True) +class StrConcatOp(BinaryOp): + name: typing.ClassVar[str] = "str_concat" + + # Note: this is actualyl a vararg op, but we don't model that yet + def output_type(self, *input_types): + if not all(map(dtypes.is_string_like, input_types)): + raise TypeError("string concat requires string-like arguments") + if len(set(input_types)) != 1: + raise TypeError("string concat requires like-typed arguments") + return input_types[0] + + +strconcat_op = StrConcatOp() # Ternary Ops @@ -510,15 +644,25 @@ class WhereOp(TernaryOp): name: typing.ClassVar[str] = "where" def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - # Second input is boolean and doesn't affect output type - return dtypes.lcd_etype(input_types[0], 
input_types[2]) + if input_types[1] != dtypes.BOOL_DTYPE: + raise TypeError("where condition must be a boolean") + return dtypes.coerce_to_common(input_types[0], input_types[2]) where_op = WhereOp() -clip_op = create_ternary_op(name="clip", type_rule=op_typing.Supertype()) +@dataclasses.dataclass(frozen=True) +class ClipOp(TernaryOp): + name: typing.ClassVar[str] = "clip" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return dtypes.coerce_to_common( + input_types[0], dtypes.coerce_to_common(input_types[1], input_types[2]) + ) + +clip_op = ClipOp() # Just parameterless unary ops for now # TODO: Parameter mappings @@ -540,6 +684,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.log10: log10_op, np.sqrt: sqrt_op, np.abs: abs_op, + np.floor: floor_op, + np.ceil: ceil_op, + np.log1p: log1p_op, + np.expm1: expm1_op, } @@ -549,4 +697,5 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT np.multiply: mul_op, np.divide: div_op, np.power: pow_op, + np.arctan2: arctan2_op, } diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index ad5abb4bca..04534e20a9 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -14,7 +14,6 @@ import abc import typing -import uuid import pandas as pd @@ -98,6 +97,12 @@ def __init__(self, data, **kwargs) -> None: f"Only support a single color string or a column name/posision. {constants.FEEDBACK_LINK}" ) + s = self.kwargs.get("s", None) + if self._is_sequence_arg(s): + raise NotImplementedError( + f"Only support a single color string or a column name/posision. {constants.FEEDBACK_LINK}" + ) + def _compute_plot_data(self): sample = self._compute_sample_data(self.data) @@ -109,6 +114,18 @@ def _compute_plot_data(self): if self._is_column_name(c, sample) and sample[c].dtype == dtypes.STRING_DTYPE: sample[c] = sample[c].astype("object") + # To avoid Matplotlib's automatic conversion of `Float64` or `Int64` columns + # to `object` types (which breaks float-like behavior), this code proactively + # converts the column to a compatible format. 
+ s = self.kwargs.get("s", None) + if pd.core.dtypes.common.is_integer(s): + s = self.data.columns[s] + if self._is_column_name(s, sample): + if sample[s].dtype == dtypes.INT_DTYPE: + sample[s] = sample[s].astype("int64") + elif sample[s].dtype == dtypes.FLOAT_DTYPE: + sample[s] = sample[s].astype("float64") + return sample def _is_sequence_arg(self, arg): @@ -124,9 +141,3 @@ def _is_column_name(self, arg, data): and pd.core.dtypes.common.is_hashable(arg) and arg in data.columns ) - - def _generate_new_column_name(self, data): - col_name = None - while col_name is None or col_name in data.columns: - col_name = f"plot_temp_{str(uuid.uuid4())[:8]}" - return col_name diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 9a270f1ce7..76aa2a6112 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -139,10 +139,7 @@ class ProductOp(UnaryAggregateOp): name: ClassVar[str] = "product" def output_type(self, *input_types: dtypes.ExpressionType): - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE - else: - return input_types[0] + return dtypes.FLOAT_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 30e0c1e745..f469070805 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -12,64 +12,203 @@ # See the License for the specific language governing permissions and # limitations under the License. +import abc import dataclasses -import functools +from typing import Callable import bigframes.dtypes from bigframes.dtypes import ExpressionType -# TODO: Apply input type constraints to help pre-empt invalid expression construction - @dataclasses.dataclass -class OpTypeRule: - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - raise NotImplementedError("Abstract typing rule has no output type") +class TypeSignature(abc.ABC): + """ + Type Signature represent a mapping from input types to output type. + + Type signatures should throw a TypeError if the input types cannot be handled by the operation. + """ + + @property + @abc.abstractmethod + def as_method(self): + """Convert the signature into an object method. Convenience function for constructing ops that use the signature.""" + ... + + +class UnaryTypeSignature(TypeSignature): + @abc.abstractmethod + def output_type(self, input_type: ExpressionType) -> ExpressionType: + ... + + @property + def as_method(self): + def meth(_, *input_types: ExpressionType) -> ExpressionType: + assert len(input_types) == 1 + return self.output_type(input_types[0]) + + return meth + + +class BinaryTypeSignature(TypeSignature): + @abc.abstractmethod + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + ... @property def as_method(self): def meth(_, *input_types: ExpressionType) -> ExpressionType: - return self.output_type(*input_types) + assert len(input_types) == 2 + return self.output_type(input_types[0], input_types[1]) return meth @dataclasses.dataclass -class InputType(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - assert len(input_types) == 1 - return input_types[0] +class TypePreserving(UnaryTypeSignature): + type_predicate: Callable[[ExpressionType], bool] + description: str + + def output_type(self, input_type: ExpressionType) -> ExpressionType: + if not self.type_predicate(input_type): + raise TypeError( + f"Type {input_type} is not supported. 
Type must be {self.description}" + ) + return input_type + + +@dataclasses.dataclass +class FixedOutputType(UnaryTypeSignature): + type_predicate: Callable[[ExpressionType], bool] + fixed_type: ExpressionType + description: str + + def output_type(self, input_type: ExpressionType) -> ExpressionType: + if (input_type is not None) and not self.type_predicate(input_type): + raise TypeError( + f"Type {input_type} is not supported. Type must be {self.description}" + ) + return self.fixed_type + + +@dataclasses.dataclass +class UnaryRealNumeric(UnaryTypeSignature): + """Type signature for real-valued functions like exp, log, sin, tan.""" + + def output_type(self, type: ExpressionType) -> ExpressionType: + if type is None: + return bigframes.dtypes.FLOAT_DTYPE + if not bigframes.dtypes.is_numeric(type): + raise TypeError(f"Type {type} is not numeric") + if type in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.BOOL_DTYPE): + # Real numeric ops produce floats on int input + return bigframes.dtypes.FLOAT_DTYPE + return type @dataclasses.dataclass -class RealNumeric(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), - [*input_types, bigframes.dtypes.FLOAT_DTYPE], - ) +class BinaryNumeric(BinaryTypeSignature): + """Type signature for numeric functions like multiply, modulo that can map ints to ints.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + return bigframes.dtypes.coerce_to_common(left_type, right_type) @dataclasses.dataclass -class Supertype(OpTypeRule): - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), input_types - ) +class BinaryRealNumeric(BinaryTypeSignature): + """Type signature for real-valued functions like divide, arctan2, pow.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + lcd_type = bigframes.dtypes.coerce_to_common(left_type, right_type) + if lcd_type == bigframes.dtypes.INT_DTYPE: + # Real numeric ops produce floats on int input + return bigframes.dtypes.FLOAT_DTYPE + return lcd_type @dataclasses.dataclass -class Fixed(OpTypeRule): - out_type: ExpressionType +class CoerceCommon(BinaryTypeSignature): + """Attempt to coerce inputs to a compatible type.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + try: + return bigframes.dtypes.coerce_to_common(left_type, right_type) + except TypeError: + pass + if bigframes.dtypes.can_coerce(left_type, right_type): + return right_type + if bigframes.dtypes.can_coerce(right_type, left_type): + return left_type + raise TypeError(f"Cannot coerce {left_type} and {right_type} to a common type.") - def output_type(self, *input_types: ExpressionType) -> ExpressionType: - return self.out_type +@dataclasses.dataclass +class 
Comparison(BinaryTypeSignature): + """Type signature for comparison operators.""" -# Common type rules -NUMERIC = Supertype() -REAL_NUMERIC = RealNumeric() -PREDICATE = Fixed(bigframes.dtypes.BOOL_DTYPE) -INTEGER = Fixed(bigframes.dtypes.INT_DTYPE) -STRING = Fixed(bigframes.dtypes.STRING_DTYPE) -INPUT_TYPE = InputType() + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + common_type = CoerceCommon().output_type(left_type, right_type) + if not bigframes.dtypes.is_comparable(common_type): + raise TypeError(f"Types {left_type} and {right_type} are not comparable") + return bigframes.dtypes.BOOL_DTYPE + + +@dataclasses.dataclass +class Logical(BinaryTypeSignature): + """Type signature for logical operators like AND, OR and NOT.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if left_type is None or right_type is None: + return bigframes.dtypes.BOOL_DTYPE + if not bigframes.dtypes.is_binary_like(left_type): + raise TypeError(f"Type {left_type} is not binary") + if not bigframes.dtypes.is_binary_like(right_type): + raise TypeError(f"Type {right_type} is not binary") + if left_type != right_type: + raise TypeError( + "Bitwise operands {left_type} and {right_type} do not match" + ) + return left_type + + +# Common type signatures +UNARY_NUMERIC = TypePreserving(bigframes.dtypes.is_numeric, description="numeric") +UNARY_REAL_NUMERIC = UnaryRealNumeric() +BINARY_NUMERIC = BinaryNumeric() +BINARY_REAL_NUMERIC = BinaryRealNumeric() +COMPARISON = Comparison() +COERCE = CoerceCommon() +LOGICAL = Logical() +STRING_TRANSFORM = TypePreserving( + bigframes.dtypes.is_string_like, description="numeric" +) +STRING_PREDICATE = FixedOutputType( + bigframes.dtypes.is_string_like, + bigframes.dtypes.BOOL_DTYPE, + description="string-like", +) +DATELIKE_ACCESSOR = FixedOutputType( + bigframes.dtypes.is_date_like, bigframes.dtypes.INT_DTYPE, description="date-like" +) +TIMELIKE_ACCESSOR = FixedOutputType( + bigframes.dtypes.is_time_like, bigframes.dtypes.INT_DTYPE, description="time-like" +) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index b6476c5eb8..4b0ac4310c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -577,7 +577,22 @@ def read_gbq_table( read_gbq_table.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_table) +@typing.overload def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: + ... + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + +@typing.overload +def read_pandas(pandas_dataframe: pandas.Index) -> bigframes.core.indexes.Index: + ... + + +def read_pandas(pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index]): return global_session.with_default_session( bigframes.session.Session.read_pandas, pandas_dataframe, @@ -714,9 +729,13 @@ def to_datetime( # which the applicable limit is now hard coded. 
See: # https://github.com/python/cpython/issues/112282 sys.setrecursionlimit(max(10000000, sys.getrecursionlimit())) -resource.setrlimit( - resource.RLIMIT_STACK, (resource.RLIM_INFINITY, resource.RLIM_INFINITY) -) + +soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_STACK) +if soft_limit < hard_limit or hard_limit == resource.RLIM_INFINITY: + try: + resource.setrlimit(resource.RLIMIT_STACK, (hard_limit, hard_limit)) + except Exception: + pass # Use __all__ to let type checkers know what is part of the public API. __all___ = [ diff --git a/bigframes/series.py b/bigframes/series.py index e7b358c2fe..185891bc01 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -176,6 +176,7 @@ def __len__(self): return self.shape[0] def __iter__(self) -> typing.Iterator: + self._optimize_query_complexity() return itertools.chain.from_iterable( map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches()) ) @@ -328,6 +329,7 @@ def to_pandas( pandas.Series: A pandas Series with all rows of this Series if the data_sampling_threshold_mb is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. """ + self._optimize_query_complexity() df, query_job = self._block.to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, @@ -351,9 +353,11 @@ def drop( columns: Union[blocks.Label, typing.Iterable[blocks.Label]] = None, level: typing.Optional[LevelType] = None, ) -> Series: - if labels and index: - raise ValueError("Must specify exacly one of 'labels' or 'index'") - index = labels or index + if (labels is None) == (index is None): + raise ValueError("Must specify exactly one of 'labels' or 'index'") + + if labels is not None: + index = labels # ignore axis, columns params block = self._block @@ -1514,7 +1518,7 @@ def map( map_df = map_df.rename(columns={arg.name: self.name}) elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( - {"keys": list(arg.keys()), self.name: list(arg.values())}, + {"keys": list(arg.keys()), self.name: list(arg.values())}, # type: ignore session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") @@ -1547,6 +1551,13 @@ def sample( )[0] ) + def explode(self, *, ignore_index: Optional[bool] = False) -> Series: + return Series( + self._block.explode( + column_ids=[self._value_column], ignore_index=ignore_index + ) + ) + def __array_ufunc__( self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs ) -> Series: @@ -1594,6 +1605,14 @@ def _cached(self, *, force: bool = True) -> Series: self._set_block(self._block.cached(force=force)) return self + def _optimize_query_complexity(self): + """Reduce query complexity by caching repeated subtrees and recursively materializing maximum-complexity subtrees. + May generate many queries and take substantial time to execute. 
+ """ + # TODO: Move all this to session + new_expr = self._block.session._simplify_with_caching(self._block.expr) + self._set_block(self._block.swap_array_expr(new_expr)) + def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 479b3a7bac..354352f1c9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -80,9 +80,11 @@ import bigframes.core.blocks as blocks import bigframes.core.compile import bigframes.core.guid as guid +import bigframes.core.nodes as nodes from bigframes.core.ordering import IntegerEncoding import bigframes.core.ordering as order -import bigframes.core.traversal as traversals +import bigframes.core.tree_properties as traversals +import bigframes.core.tree_properties as tree_properties import bigframes.core.utils as utils import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers @@ -94,7 +96,9 @@ # Avoid circular imports. if typing.TYPE_CHECKING: + import bigframes.core.indexes import bigframes.dataframe as dataframe + import bigframes.series _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" @@ -113,9 +117,14 @@ "UTF-32LE", } -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 +# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table. +# Also must assume that text encoding as literals is much less efficient than in-memory representation. +MAX_INLINE_DF_BYTES = 5000 + +# Max complexity that should be executed as a single query +QUERY_COMPLEXITY_LIMIT = 1e7 +# Number of times to factor out subqueries before giving up. +MAX_SUBTREE_FACTORINGS = 5 logger = logging.getLogger(__name__) @@ -952,7 +961,7 @@ def read_gbq_model(self, model_name: str): to load from the default project. Returns: - A bigframes.ml Model wrapping the model. + A bigframes.ml Model, Transformer or Pipeline wrapping the model. """ import bigframes.ml.loader @@ -962,7 +971,23 @@ def read_gbq_model(self, model_name: str): model = self.bqclient.get_model(model_ref) return bigframes.ml.loader.from_bq(self, model) + @typing.overload + def read_pandas( + self, pandas_dataframe: pandas.Index + ) -> bigframes.core.indexes.Index: + ... + + @typing.overload + def read_pandas(self, pandas_dataframe: pandas.Series) -> bigframes.series.Series: + ... + + @typing.overload def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame: + ... + + def read_pandas( + self, pandas_dataframe: Union[pandas.DataFrame, pandas.Series, pandas.Index] + ): """Loads DataFrame from a pandas DataFrame. The pandas DataFrame will be persisted as a temporary BigQuery table, which can be @@ -985,13 +1010,31 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame [2 rows x 2 columns] Args: - pandas_dataframe (pandas.DataFrame): - a pandas DataFrame object to be loaded. + pandas_dataframe (pandas.DataFrame, pandas.Series, or pandas.Index): + a pandas DataFrame/Series/Index object to be loaded. Returns: - bigframes.dataframe.DataFrame: The BigQuery DataFrame. 
+ An equivalent bigframes.pandas.(DataFrame/Series/Index) object """ - return self._read_pandas(pandas_dataframe, "read_pandas") + import bigframes.series as series + + # Try to handle non-dataframe pandas objects as well + if isinstance(pandas_dataframe, pandas.Series): + bf_df = self._read_pandas(pandas.DataFrame(pandas_dataframe), "read_pandas") + bf_series = typing.cast(series.Series, bf_df[bf_df.columns[0]]) + # wrapping into df can set name to 0 so reset to original object name + bf_series.name = pandas_dataframe.name + return bf_series + if isinstance(pandas_dataframe, pandas.Index): + return self._read_pandas( + pandas.DataFrame(index=pandas_dataframe), "read_pandas" + ).index + if isinstance(pandas_dataframe, pandas.DataFrame): + return self._read_pandas(pandas_dataframe, "read_pandas") + else: + raise ValueError( + f"read_pandas() expects a pandas.DataFrame, but got a {type(pandas_dataframe)}" + ) def _read_pandas( self, pandas_dataframe: pandas.DataFrame, api_name: str @@ -1014,7 +1057,7 @@ def _read_pandas_inline( ) -> Optional[dataframe.DataFrame]: import bigframes.dataframe as dataframe - if pandas_dataframe.size > MAX_INLINE_DF_SIZE: + if pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES: return None try: @@ -1814,6 +1857,52 @@ def _cache_with_offsets(self, array_value: core.ArrayValue) -> core.ArrayValue: ordering=order.ExpressionOrdering.from_offset_col("bigframes_offsets"), ) + def _simplify_with_caching(self, array_value: core.ArrayValue) -> core.ArrayValue: + """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" + if not bigframes.options.compute.enable_multi_query_execution: + return array_value + node = array_value.node + if node.planning_complexity < QUERY_COMPLEXITY_LIMIT: + return array_value + + for _ in range(MAX_SUBTREE_FACTORINGS): + updated = self._cache_most_complex_subtree(node) + if updated is None: + return core.ArrayValue(node) + else: + node = updated + + return core.ArrayValue(node) + + def _cache_most_complex_subtree( + self, node: nodes.BigFrameNode + ) -> Optional[nodes.BigFrameNode]: + # TODO: If query fails, retry with lower complexity limit + valid_candidates = traversals.count_complex_nodes( + node, + min_complexity=(QUERY_COMPLEXITY_LIMIT / 500), + max_complexity=QUERY_COMPLEXITY_LIMIT, + ).items() + # Heuristic: subtree_compleixty * (copies of subtree)^2 + best_candidate = max( + valid_candidates, + key=lambda i: i[0].planning_complexity + (i[1] ** 2), + default=None, + ) + + if best_candidate is None: + # No good subtrees to cache, just return original tree + return None + + # TODO: Add clustering columns based on access patterns + materialized = self._cache_with_cluster_cols( + core.ArrayValue(best_candidate[0]), [] + ).node + + return traversals.replace_nodes( + node, to_replace=best_candidate[0], replacemenet=materialized + ) + def _is_trivially_executable(self, array_value: core.ArrayValue): """ Can the block be evaluated very cheaply? 
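A minimal usage sketch (not part of the diff itself) of the multi-query execution path introduced in the hunk above. It assumes `bigframes.pandas` exposes the same `options` object whose `compute.enable_multi_query_execution` flag `_simplify_with_caching` checks; the table and column names are placeholders:

    import bigframes.pandas as bpd

    # Opt in to the behavior gated by options.compute.enable_multi_query_execution.
    bpd.options.compute.enable_multi_query_execution = True

    df = bpd.read_gbq("your-project.your_dataset.your_table")  # placeholder table
    s = df["some_column"]  # placeholder column; a bigframes Series

    # Series.to_pandas() now calls _optimize_query_complexity(), so if the plan's
    # complexity exceeds QUERY_COMPLEXITY_LIMIT, the most complex repeated subtrees
    # may be cached as intermediate tables (at most MAX_SUBTREE_FACTORINGS times)
    # before the final query runs.
    result = s.to_pandas()

If the option is not enabled, `_simplify_with_caching` returns the expression unchanged and no extra caching queries are issued.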
@@ -1848,8 +1937,8 @@ def _peek( self, array_value: core.ArrayValue, n_rows: int ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """A 'peek' efficiently accesses a small number of rows in the dataframe.""" - if not array_value.node.peekable: - raise NotImplementedError("cannot efficient peek this dataframe") + if not tree_properties.peekable(array_value.node): + warnings.warn("Peeking this value cannot be done efficiently.") sql = self._compile_unordered(array_value).peek_sql(n_rows) return self._start_query( sql=sql, diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index d97e53901d..32f13fa00d 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -69,12 +69,12 @@ class ClientsProvider: def __init__( self, - project: Optional[str], - location: Optional[str], - use_regional_endpoints: Optional[bool], - credentials: Optional[google.auth.credentials.Credentials], - application_name: Optional[str], - bq_kms_key_name: Optional[str], + project: Optional[str] = None, + location: Optional[str] = None, + use_regional_endpoints: Optional[bool] = None, + credentials: Optional[google.auth.credentials.Credentials] = None, + application_name: Optional[str] = None, + bq_kms_key_name: Optional[str] = None, ): credentials_project = None if credentials is None: diff --git a/bigframes/typing.py b/bigframes/typing.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigframes/version.py b/bigframes/version.py index 8e31592250..41a3895549 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.0.0" +__version__ = "1.1.0" diff --git a/docs/index.rst b/docs/index.rst index d239ea3a78..b17ac7cbd9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,7 @@ API reference :maxdepth: 3 reference/index + supported_pandas_apis Changelog --------- diff --git a/docs/reference/bigframes.pandas/indexing.rst b/docs/reference/bigframes.pandas/indexing.rst index 8f7f194740..2cc1acfabf 100644 --- a/docs/reference/bigframes.pandas/indexing.rst +++ b/docs/reference/bigframes.pandas/indexing.rst @@ -3,7 +3,7 @@ Index objects ============= -.. autoclass:: bigframes.core.indexes.index.Index +.. autoclass:: bigframes.core.indexes.base.Index :members: :inherited-members: :undoc-members: diff --git a/docs/supported_pandas_apis.rst b/docs/supported_pandas_apis.rst new file mode 100644 index 0000000000..f4b57f05d1 --- /dev/null +++ b/docs/supported_pandas_apis.rst @@ -0,0 +1,62 @@ +Supported pandas APIs +===================== + +The following tables show the pandas APIs that have been implemented (or not) +in BigQuery DataFrames. + +* 'Y' means it implements all parameters. +* 'P' means it implements only some parameters. + +DataFrame +--------- + +.. raw:: html + :file: supported_pandas_apis/bf_dataframe.html + +DataFrameGroupBy +---------------- + +.. raw:: html + :file: supported_pandas_apis/bf_dataframegroupby.html + +Index +----- + +.. raw:: html + :file: supported_pandas_apis/bf_index.html + +pandas module +------------- + +.. raw:: html + :file: supported_pandas_apis/bf_pandas.html + +Series +------ + +.. raw:: html + :file: supported_pandas_apis/bf_series.html + +Series.dt methods +----------------- + +.. raw:: html + :file: supported_pandas_apis/bf_datetimemethods.html + +Series.str methods +------------------ + +.. 
raw:: html + :file: supported_pandas_apis/bf_stringmethods.html + +SeriesGroupBy +------------- + +.. raw:: html + :file: supported_pandas_apis/bf_seriesgroupby.html + +Window +------ + +.. raw:: html + :file: supported_pandas_apis/bf_window.html diff --git a/docs/supported_pandas_apis/.gitignore b/docs/supported_pandas_apis/.gitignore new file mode 100644 index 0000000000..2d19fc766d --- /dev/null +++ b/docs/supported_pandas_apis/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c07e6141f1..3c2c688d78 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -40,6 +40,8 @@ - name: SeriesGroupBy uid: bigframes.core.groupby.SeriesGroupBy name: Groupby + - name: Index + uid: bigframes.core.indexes.base.Index - items: - name: AtDataFrameIndexer uid: bigframes.core.indexers.AtDataFrameIndexer @@ -60,6 +62,11 @@ name: Indexers - name: pandas uid: bigframes.pandas + - items: + - name: Plotting + uid: bigframes.operations.plotting + - name: PlotAccessor + uid: bigframes.operations.plotting.PlotAccessor - items: - name: Series uid: bigframes.series.Series @@ -72,6 +79,8 @@ name: Series - name: Window uid: bigframes.core.window.Window + - href: supported_pandas_apis.html + name: Supported pandas APIs name: bigframes.pandas - items: - items: diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 15da075552..de9bb1d04f 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -1,5 +1,27 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "eeec3428", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb new file mode 100644 index 0000000000..735e18d94e --- /dev/null +++ b/notebooks/dataframes/integrations.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Integrating with BigQuery DataFrames\n", + "\n", + "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "# Sample data\n", + "df = bpd.DataFrame({\n", + " \"index\": [0, 1, 2, 3, 4],\n", + " \"int_col\": [1, 2, 3, 4, 5],\n", + " \"float_col\": [1.0, -0.5, 0.25, -0.125, 0.0625],\n", + " \"string_col\": [\"a\", \"b\", \"c\", \"d\", \"e\"],\n", + "}).set_index(\"index\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accepting a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the `DataFrame.to_gbq()` method. With no `destination_table`, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq()\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sharing the table with your application's backend\n", + "\n", + "Tables created in the user's anonymous dataset are only queryable by the user who created them. Many applications authenticate with a [service account](https://cloud.google.com/iam/docs/service-account-overview), which may be different from the end-user running BigQuery DataFrames (bigframes).\n", + "\n", + "Grant your application access to this table by granting your application's service account associated with the customer the `roles/bigquery.dataViewer` role on the [BigQuery table with an IAM policy](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_table_or_view)." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 2 3 0.2500 c\n", + "1 4 5 0.0625 e\n", + "2 0 1 1.0000 a\n", + "3 1 2 -0.5000 b\n", + "4 3 4 -0.1250 d\n" + ] + } + ], + "source": [ + "# This sample assumes the client code knows which service account to share with.\n", + "your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + "\n", + "\n", + "def df_to_gbq_plus_workoad(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # TODO(developer): Pass table_id to your application and start your workload.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workoad(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 4 5 0.0625 e\n", + "1 0 1 1.0000 a\n", + "2 2 3 0.2500 c\n", + "3 3 4 -0.1250 d\n", + "4 1 2 -0.5000 b\n" + ] + } + ], + "source": [ + "# This sample assumes the client code doesn't know which service account to share with.\n", + "\n", + "\n", + "def df_to_gbq_plus_workoad(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " token = bqclient._credentials.token\n", + " project_id = bqclient.project\n", + "\n", + " share_table_and_start_workload(table_id, token, project_id)\n", + "\n", + "\n", + "def share_table_and_start_workload(table_id, token, project_id):\n", + " # This code runs in the backend for your application.\n", + " from google.cloud import bigquery\n", + " import google.oauth2.credentials\n", + "\n", + " # Note: these credentials don't have any way to be refreshed,\n", + " # so only use them long enough to share the table with the\n", + " # service account.\n", + " credentials = google.oauth2.credentials.Credentials(token)\n", + " bqclient = bigquery.Client(\n", + " project=project_id,\n", + " credentials=credentials,\n", + " )\n", + "\n", + " # This is assumed to only be available on the backend.\n", + " your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # Now that the table has been shared, bqclient with the temporary token\n", + " # is no longer needed.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workoad(df)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preserving order\n", + "\n", + "Depending on your use case, you may want to include the ordering so that it can be restored withing your application." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ordering_column = \"ordering_id_maybe_with_some_random_text_to_avoid_collisions\"\n", + "table_id = df.to_gbq(ordering_id=ordering_column)\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating clustered tables\n", + "\n", + "Large tables can be optimized by passing in `clustering_columns` to create a [clustered table](https://cloud.google.com/bigquery/docs/clustered-tables)." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq(clustering_columns=(\"index\", \"int_col\"))\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Returning a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.\n", + "\n", + "This sample assumes there is a shared dataset that\n", + "\n", + "1. The application can write to and\n", + "2. the bigframes user can read from.\n", + "\n", + "There are many ways an application can [write to a BigQuery table](https://cloud.google.com/bigquery/docs/loading-data), including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application's needs." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The assumption is that there is a shared dataset to work with.\n", + "from google.cloud import bigquery\n", + "\n", + "bqclient = bigquery.Client()\n", + "bqclient.create_dataset(\"my_dataset\", exists_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
(HTML table rendering omitted: same 10 rows x 3 columns - unique_index, state, postal_code, pop - as the text/plain output below)
[10 rows x 3 columns in total]" + ], + "text/plain": [ + " state postal_code pop\n", + "unique_index \n", + "2 MI 48105 669\n", + "3 GA 30309 2581\n", + "5 TX 78701 5373\n", + "7 CO 80301 2087\n", + "11 MA 02142 2592\n", + "13 IL 60607 2630\n", + "17 MI 48201 2\n", + "19 NC 27701 801\n", + "23 CA 92612 1115\n", + "29 WA 98033 4952\n", + "\n", + "[10 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For simplicity, this sample assumes your application uses\n", + "# a load job with the CSV file format.\n", + "# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python\n", + "import datetime\n", + "import io\n", + "import random\n", + "\n", + "\n", + "def create_table_for_bigframes():\n", + " # This code is assumed to run on the application's backend.\n", + " from google.cloud import bigquery\n", + "\n", + " client = bigquery.Client()\n", + "\n", + " # The end-user is expected to have read access to this table.\n", + " table_suffix = f\"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}\"\n", + " table_id = f\"{client.project}.my_dataset.integrations_ipynb_{table_suffix}\"\n", + "\n", + " # Best practice: set the primary key to a unique column to use as the\n", + " # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.\n", + " # Having a unique identity column allows the DataFrame to be constructed\n", + " # more efficiently.\n", + " #\n", + " # Note 1: Even a random UUID would be helpful for efficiency.\n", + " #\n", + " # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery\n", + " # query engine uses this property to optimize queries. Non-unique primary\n", + " # keys result in undefined behavior.\n", + " #\n", + " # Note 3: client.create_table doesn't support primary key, so instead\n", + " # use DDL to create the table.\n", + " create_table_ddl = f\"\"\"\n", + " CREATE OR REPLACE TABLE `{table_id}`\n", + " (\n", + " unique_index INT64,\n", + " state STRING,\n", + " postal_code STRING,\n", + " pop INT64,\n", + " PRIMARY KEY (unique_index) NOT ENFORCED\n", + " )\n", + " -- Clustering by the index column can make joins and loc operations more efficient.\n", + " -- Also cluster by columns which are expected to be used as common filters.\n", + " CLUSTER BY unique_index, state\n", + " \"\"\"\n", + " client.query_and_wait(create_table_ddl)\n", + "\n", + " csv_file = io.BytesIO(\n", + "b\"\"\"unique_index,state,postal_code,pop\n", + "2,MI,48105,669\n", + "3,GA,30309,2581\n", + "5,TX,78701,5373\n", + "7,CO,80301,2087\n", + "11,MA,02142,2592\n", + "13,IL,60607,2630\n", + "17,MI,48201,2\n", + "19,NC,27701,801\n", + "23,CA,92612,1115\n", + "29,WA,98033,4952\n", + "\"\"\"\n", + " )\n", + " job_config = bigquery.LoadJobConfig(\n", + " skip_leading_rows=1,\n", + " source_format=bigquery.SourceFormat.CSV,\n", + " )\n", + " load_job = client.load_table_from_file(\n", + " csv_file, table_id, job_config=job_config\n", + " )\n", + " load_job.result() # Waits for the job to complete.\n", + "\n", + " return table_id\n", + "\n", + "\n", + "table_id = create_table_for_bigframes()\n", + "\n", + "\n", + "# This is assumed to run on the client.\n", + "import bigframes.pandas as bpd\n", + "df = bpd.read_gbq_table(table_id, index_col=[\"unique_index\"])\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bigframes", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + 
"name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 61445d85c5..ab6fd93f9a 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -289,7 +289,7 @@ { "data": { "text/html": [ - "Query job d5778724-6966-42ba-b8a6-2a1865a1184c is DONE. 2.3 GB processed. Open Job" + "Query job 952b852e-7cf0-493d-8258-fe60daf45ebf is DONE. 2.3 GB processed. Open Job" ], "text/plain": [ "" @@ -301,7 +301,7 @@ { "data": { "text/html": [ - "Query job 4d48bf69-571c-4773-8486-0232840597d5 is DONE. 55.1 MB processed. Open Job" + "Query job f9939880-6c66-4da5-9e90-daf8d9a9d83c is DONE. 50.3 MB processed. Open Job" ], "text/plain": [ "" @@ -336,36 +336,36 @@ " \n", " \n", " \n", - " 24\n", - " I sent disputed to Transunion, XXXX and XXXX f...\n", + " 1799560\n", + " Thursday, XX/XX/XXXX, unauthorized charges wer...\n", " \n", " \n", - " 942\n", - " on XX/XX/2017 I sent XXXX, transunion, XXXX pr...\n", + " 1800272\n", + " The credit reporting company is reporting inac...\n", " \n", " \n", - " 1193\n", - " On Wednesday, XXXX XXXX , I initiated a wir...\n", + " 1800409\n", + " In accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 1292\n", - " Dear Sir or Madam, I am a victim of identity t...\n", + " 1800550\n", + " I told the credit bureaus to \" investigate eve...\n", " \n", " \n", - " 1377\n", - " For the purpose of this complaint, I will refe...\n", + " 1800818\n", + " Im writing in reference regarding XXXXXXXX XXX...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " consumer_complaint_narrative\n", - "24 I sent disputed to Transunion, XXXX and XXXX f...\n", - "942 on XX/XX/2017 I sent XXXX, transunion, XXXX pr...\n", - "1193 On Wednesday, XXXX XXXX , I initiated a wir...\n", - "1292 Dear Sir or Madam, I am a victim of identity t...\n", - "1377 For the purpose of this complaint, I will refe..." + " consumer_complaint_narrative\n", + "1799560 Thursday, XX/XX/XXXX, unauthorized charges wer...\n", + "1800272 The credit reporting company is reporting inac...\n", + "1800409 In accordance with the Fair Credit Reporting a...\n", + "1800550 I told the credit bureaus to \" investigate eve...\n", + "1800818 Im writing in reference regarding XXXXXXXX XXX..." ] }, "execution_count": 7, @@ -418,7 +418,7 @@ { "data": { "text/html": [ - "Query job 15b352c2-783c-42b1-bc03-e5772f00381a is DONE. 0 Bytes processed. Open Job" + "Query job e3ff0549-f0ee-4508-bb4f-beea14bf54f5 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -444,7 +444,7 @@ { "data": { "text/html": [ - "Query job e2152e81-b736-4a68-a25a-c5eb2b03d734 is DONE. 1.3 GB processed. Open Job" + "Query job 5b3d8f8c-9e8d-4378-b4df-e3328300f17a is DONE. 1.3 GB processed. Open Job" ], "text/plain": [ "" @@ -456,7 +456,7 @@ { "data": { "text/html": [ - "Query job b1a3d20b-aee3-424c-a0c5-5b36f1177709 is DONE. 80.0 kB processed. Open Job" + "Query job f35c2982-4953-45fa-84bd-d0ce04e13c5e is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -468,7 +468,7 @@ { "data": { "text/html": [ - "Query job 6b2fad50-cbc8-42ea-83c1-b5d3eaac10b9 is DONE. 20.0 kB processed. Open Job" + "Query job b70c55a3-b18b-4313-86b0-31f5b3b570fb is DONE. 
20.0 kB processed. Open Job" ], "text/plain": [ "" @@ -480,7 +480,7 @@ { "data": { "text/html": [ - "Query job 31896ae6-fbb5-42fb-98c4-13bd19d1adfa is DONE. 0 Bytes processed. Open Job" + "Query job 2b2cfd9f-c713-4411-a3ca-1916cec84ff0 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -492,7 +492,7 @@ { "data": { "text/html": [ - "Query job 43f04543-f59b-4f1b-8598-c529324904be is DONE. 72.1 MB processed. Open Job" + "Query job 09cadae1-1c66-43cf-a76f-7495b0123006 is DONE. 71.9 MB processed. Open Job" ], "text/plain": [ "" @@ -530,179 +530,179 @@ " \n", " \n", " \n", - " 545\n", - " [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-...\n", - " {\"token_count\":178,\"truncated\":false}\n", + " 782\n", + " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", + " {\"token_count\":121,\"truncated\":false}\n", " \n", - " My payments have been approximately {$89.00} w...\n", + " I 've sent multiple letters to this agency abo...\n", " \n", " \n", - " 614\n", - " [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-...\n", - " {\"token_count\":399,\"truncated\":false}\n", + " 795\n", + " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", + " {\"token_count\":141,\"truncated\":false}\n", " \n", - " Hi, I have contacted Trans Union XXXX XXXX abo...\n", + " I receive social security XXXX funds in my XXX...\n", " \n", " \n", - " 1236\n", - " [-5.32836001e-03 -5.84292673e-02 -5.86670786e-...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 861\n", + " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I have a XXXX XXXX XXXX credit card on my Exp...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1477\n", - " [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-...\n", - " {\"token_count\":16,\"truncated\":false}\n", + " 1103\n", + " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", + " {\"token_count\":31,\"truncated\":false}\n", " \n", - " Wrongs information, selling my information to ...\n", + " The debt occurred more than 7 years in the pas...\n", " \n", " \n", - " 2261\n", - " [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-...\n", - " {\"token_count\":33,\"truncated\":false}\n", + " 1241\n", + " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " Please investigate and delete disputed item th...\n", + " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", " \n", " \n", - " 2361\n", - " [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-...\n", - " {\"token_count\":45,\"truncated\":false}\n", + " 1729\n", + " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", + " {\"token_count\":382,\"truncated\":false}\n", " \n", - " By the provisions of the Fair Credit Reporting...\n", + " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", " \n", " \n", - " 2378\n", - " [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-...\n", - " {\"token_count\":892,\"truncated\":false}\n", + " 2167\n", + " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", + " {\"token_count\":556,\"truncated\":false}\n", " \n", - " Since XX/XX/XXXX I have been trying to dispute...\n", + " This is the third such complaint I have submit...\n", " \n", " \n", - " 3133\n", - " [ 0.00152804 -0.04189068 -0.04220504 -0.053740...\n", - " {\"token_count\":90,\"truncated\":false}\n", + " 2219\n", + " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", + " {\"token_count\":196,\"truncated\":false}\n", " \n", - " Out of the blue I received a debt collection n...\n", + " Found and add online for a Prepaid Credit card...\n", " \n", " \n", - " 3140\n", - " [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-...\n", - " {\"token_count\":372,\"truncated\":false}\n", + " 2392\n", + " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", + " {\"token_count\":641,\"truncated\":false}\n", " \n", - " My wife and I have been sending money to XXXX ...\n", + " I am furnishing this complaint against Fed Loa...\n", " \n", " \n", - " 3322\n", - " [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-...\n", - " {\"token_count\":36,\"truncated\":false}\n", + " 2528\n", + " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " Phone calls from Convergent Outsourcing XXXX. ...\n", + " Despite multiple written requests, the unverif...\n", " \n", " \n", - " 3583\n", - " [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-...\n", - " {\"token_count\":52,\"truncated\":false}\n", + " 2737\n", + " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", + " {\"token_count\":230,\"truncated\":false}\n", " \n", - " I recently received a copy of my credit report...\n", + " After unsatisfying communication in the messag...\n", " \n", " \n", - " 4134\n", - " [-7.04960374e-04 -3.52595337e-02 -1.65264793e-...\n", - " {\"token_count\":412,\"truncated\":false}\n", + " 2859\n", + " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", + " {\"token_count\":238,\"truncated\":false}\n", " \n", - " I have been sending the creditor what they hav...\n", + " Good Morning. My name is XXXX XXXX. My account...\n", " \n", " \n", - " 4496\n", - " [ 3.67735326e-02 1.21120387e-03 -5.20942472e-...\n", - " {\"token_count\":182,\"truncated\":false}\n", + " 3439\n", + " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", + " {\"token_count\":197,\"truncated\":false}\n", " \n", - " This is my second complaint. Their response to...\n", + " I have ongoing disputes that are preventing me...\n", " \n", " \n", - " 5260\n", - " [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-...\n", - " {\"token_count\":103,\"truncated\":false}\n", + " 3738\n", + " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte...\n", + " I had a loan with national Collegiate Trust. 
i...\n", " \n", " \n", - " 5400\n", - " [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 3805\n", + " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", + " {\"token_count\":477,\"truncated\":false}\n", " \n", - " Upon checking my XXXX credit report I noticed ...\n", + " Hi I am submitting this XXXX XXXX this isn't a...\n", " \n", " \n", - " 5425\n", - " [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 3915\n", + " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", + " {\"token_count\":116,\"truncated\":false}\n", " \n", - " Follow up to previous complaint XXXX XXXX XXXX...\n", + " portfolio is showin on my credit report with a...\n", " \n", " \n", - " 6014\n", - " [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 3917\n", + " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " My new XXXX lease was over always paid on time...\n", + " the company shared my information with another...\n", " \n", " \n", - " 8192\n", - " [ 0.01937891 -0.05466933 -0.06070872 -0.059028...\n", - " {\"token_count\":131,\"truncated\":false}\n", + " 4281\n", + " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", + " {\"token_count\":130,\"truncated\":false}\n", " \n", - " I have no idea where this account cane from. B...\n", + " I tried to submit a teacher loan forgiveness a...\n", " \n", " \n", - " 8240\n", - " [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 8720\n", - " [ 0.03133732 -0.03972461 -0.00178199 -0.035876...\n", - " {\"token_count\":645,\"truncated\":false}\n", + " 4915\n", + " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum...\n", + " XXXX XXXX did not give me a receipt or a copy ...\n", " \n", " \n", - " 8914\n", - " [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-...\n", - " {\"token_count\":180,\"truncated\":false}\n", + " 4928\n", + " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", + " {\"token_count\":83,\"truncated\":false}\n", " \n", - " On XX/XX/21 I sent a letter regarding inaccura...\n", + " This company has filed a civil suit during a g...\n", " \n", " \n", - " 10021\n", - " [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-...\n", - " {\"token_count\":30,\"truncated\":false}\n", + " 5338\n", + " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", + " {\"token_count\":1279,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX inaccurate informati...\n", + " My credit report contains errors that is keepi...\n", " \n", " \n", - " 10327\n", - " [-0.00979626 -0.04912931 -0.08654705 -0.021063...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 5582\n", + " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", + " {\"token_count\":396,\"truncated\":false}\n", " \n", - " When I reviewed my credit report, I discovered...\n", + " Coast Professional, XXXX, LA contacted me by m...\n", " \n", " \n", - " 10345\n", - " [-0.04292191 -0.02636929 -0.06177032 -0.076520...\n", - " {\"token_count\":262,\"truncated\":false}\n", + 
" 6386\n", + " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " U.S. Bank sent two letters containing Visa Deb...\n", + " Cares act refund requested in XXXX, called mul...\n", " \n", " \n", - " 10369\n", - " [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 6956\n", + " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", + " {\"token_count\":194,\"truncated\":false}\n", " \n", - " I requested from XXXX that they reverse the la...\n", + " n accordance with the Fair Credit Reporting ac...\n", " \n", " \n", "\n", @@ -710,86 +710,86 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", - "614 [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... \n", - "1236 [-5.32836001e-03 -5.84292673e-02 -5.86670786e-... \n", - "1477 [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... \n", - "2261 [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... \n", - "2361 [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... \n", - "2378 [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... \n", - "3133 [ 0.00152804 -0.04189068 -0.04220504 -0.053740... \n", - "3140 [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... \n", - "3322 [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... \n", - "3583 [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... \n", - "4134 [-7.04960374e-04 -3.52595337e-02 -1.65264793e-... \n", - "4496 [ 3.67735326e-02 1.21120387e-03 -5.20942472e-... \n", - "5260 [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... \n", - "5400 [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... \n", - "5425 [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... \n", - "6014 [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... \n", - "8192 [ 0.01937891 -0.05466933 -0.06070872 -0.059028... \n", - "8240 [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... \n", - "8720 [ 0.03133732 -0.03972461 -0.00178199 -0.035876... \n", - "8914 [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... \n", - "10021 [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... \n", - "10327 [-0.00979626 -0.04912931 -0.08654705 -0.021063... \n", - "10345 [-0.04292191 -0.02636929 -0.06177032 -0.076520... \n", - "10369 [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... \n", + " text_embedding \\\n", + "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", + "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", + "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", + "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", + "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", + "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", + "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", + "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", + "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", + "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", + "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", + "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", + "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", + "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", + "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", + "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", + "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", + "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", + "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... 
\n", + "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... \n", + "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", + "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", + "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", + "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", + "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", "\n", " statistics ml_embed_text_status \\\n", - "545 {\"token_count\":178,\"truncated\":false} \n", - "614 {\"token_count\":399,\"truncated\":false} \n", - "1236 {\"token_count\":129,\"truncated\":false} \n", - "1477 {\"token_count\":16,\"truncated\":false} \n", - "2261 {\"token_count\":33,\"truncated\":false} \n", - "2361 {\"token_count\":45,\"truncated\":false} \n", - "2378 {\"token_count\":892,\"truncated\":false} \n", - "3133 {\"token_count\":90,\"truncated\":false} \n", - "3140 {\"token_count\":372,\"truncated\":false} \n", - "3322 {\"token_count\":36,\"truncated\":false} \n", - "3583 {\"token_count\":52,\"truncated\":false} \n", - "4134 {\"token_count\":412,\"truncated\":false} \n", - "4496 {\"token_count\":182,\"truncated\":false} \n", - "5260 {\"token_count\":103,\"truncated\":false} \n", - "5400 {\"token_count\":60,\"truncated\":false} \n", - "5425 {\"token_count\":87,\"truncated\":false} \n", - "6014 {\"token_count\":175,\"truncated\":false} \n", - "8192 {\"token_count\":131,\"truncated\":false} \n", - "8240 {\"token_count\":87,\"truncated\":false} \n", - "8720 {\"token_count\":645,\"truncated\":false} \n", - "8914 {\"token_count\":180,\"truncated\":false} \n", - "10021 {\"token_count\":30,\"truncated\":false} \n", - "10327 {\"token_count\":194,\"truncated\":false} \n", - "10345 {\"token_count\":262,\"truncated\":false} \n", - "10369 {\"token_count\":77,\"truncated\":false} \n", + "782 {\"token_count\":121,\"truncated\":false} \n", + "795 {\"token_count\":141,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1103 {\"token_count\":31,\"truncated\":false} \n", + "1241 {\"token_count\":23,\"truncated\":false} \n", + "1729 {\"token_count\":382,\"truncated\":false} \n", + "2167 {\"token_count\":556,\"truncated\":false} \n", + "2219 {\"token_count\":196,\"truncated\":false} \n", + "2392 {\"token_count\":641,\"truncated\":false} \n", + "2528 {\"token_count\":176,\"truncated\":false} \n", + "2737 {\"token_count\":230,\"truncated\":false} \n", + "2859 {\"token_count\":238,\"truncated\":false} \n", + "3439 {\"token_count\":197,\"truncated\":false} \n", + "3738 {\"token_count\":160,\"truncated\":false} \n", + "3805 {\"token_count\":477,\"truncated\":false} \n", + "3915 {\"token_count\":116,\"truncated\":false} \n", + "3917 {\"token_count\":71,\"truncated\":false} \n", + "4281 {\"token_count\":130,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4915 {\"token_count\":23,\"truncated\":false} \n", + "4928 {\"token_count\":83,\"truncated\":false} \n", + "5338 {\"token_count\":1279,\"truncated\":false} \n", + "5582 {\"token_count\":396,\"truncated\":false} \n", + "6386 {\"token_count\":79,\"truncated\":false} \n", + "6956 {\"token_count\":194,\"truncated\":false} \n", "\n", - " content \n", - "545 My payments have been approximately {$89.00} w... \n", - "614 Hi, I have contacted Trans Union XXXX XXXX abo... \n", - "1236 I have a XXXX XXXX XXXX credit card on my Exp... \n", - "1477 Wrongs information, selling my information to ... \n", - "2261 Please investigate and delete disputed item th... \n", - "2361 By the provisions of the Fair Credit Reporting... 
\n", - "2378 Since XX/XX/XXXX I have been trying to dispute... \n", - "3133 Out of the blue I received a debt collection n... \n", - "3140 My wife and I have been sending money to XXXX ... \n", - "3322 Phone calls from Convergent Outsourcing XXXX. ... \n", - "3583 I recently received a copy of my credit report... \n", - "4134 I have been sending the creditor what they hav... \n", - "4496 This is my second complaint. Their response to... \n", - "5260 XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... \n", - "5400 Upon checking my XXXX credit report I noticed ... \n", - "5425 Follow up to previous complaint XXXX XXXX XXXX... \n", - "6014 My new XXXX lease was over always paid on time... \n", - "8192 I have no idea where this account cane from. B... \n", - "8240 I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... \n", - "8720 XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... \n", - "8914 On XX/XX/21 I sent a letter regarding inaccura... \n", - "10021 XX/XX/XXXX and XX/XX/XXXX inaccurate informati... \n", - "10327 When I reviewed my credit report, I discovered... \n", - "10345 U.S. Bank sent two letters containing Visa Deb... \n", - "10369 I requested from XXXX that they reverse the la... \n", + " content \n", + "782 I 've sent multiple letters to this agency abo... \n", + "795 I receive social security XXXX funds in my XXX... \n", + "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", + "1103 The debt occurred more than 7 years in the pas... \n", + "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", + "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", + "2167 This is the third such complaint I have submit... \n", + "2219 Found and add online for a Prepaid Credit card... \n", + "2392 I am furnishing this complaint against Fed Loa... \n", + "2528 Despite multiple written requests, the unverif... \n", + "2737 After unsatisfying communication in the messag... \n", + "2859 Good Morning. My name is XXXX XXXX. My account... \n", + "3439 I have ongoing disputes that are preventing me... \n", + "3738 I had a loan with national Collegiate Trust. i... \n", + "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", + "3915 portfolio is showin on my credit report with a... \n", + "3917 the company shared my information with another... \n", + "4281 I tried to submit a teacher loan forgiveness a... \n", + "4470 in accordance with the Fair Credit Reporting a... \n", + "4915 XXXX XXXX did not give me a receipt or a copy ... \n", + "4928 This company has filed a civil suit during a g... \n", + "5338 My credit report contains errors that is keepi... \n", + "5582 Coast Professional, XXXX, LA contacted me by m... \n", + "6386 Cares act refund requested in XXXX, called mul... \n", + "6956 n accordance with the Fair Credit Reporting ac... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -822,7 +822,7 @@ { "data": { "text/html": [ - "Query job c78e1040-2a57-42f6-8fdb-5b9524846259 is DONE. 72.1 MB processed. Open Job" + "Query job 2c99b34a-1956-4de7-8330-898f1f25560b is DONE. 71.9 MB processed. Open Job" ], "text/plain": [ "" @@ -834,7 +834,7 @@ { "data": { "text/html": [ - "Query job 0986541b-3941-4387-b813-8888f53d149e is DONE. 0 Bytes processed. Open Job" + "Query job 3ffed5f8-935a-4a3f-a560-6416445e4868 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -846,7 +846,7 @@ { "data": { "text/html": [ - "Query job 754aadd2-fee6-495c-acef-506f4e13c062 is DONE. 72.6 MB processed. Open Job" + "Query job 7b55783a-6d8f-41b9-b404-73253140029a is DONE. 72.3 MB processed. 
Open Job" ], "text/plain": [ "" @@ -884,179 +884,179 @@ " \n", " \n", " \n", - " 545\n", - " [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-...\n", - " {\"token_count\":178,\"truncated\":false}\n", + " 782\n", + " [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-...\n", + " {\"token_count\":121,\"truncated\":false}\n", " \n", - " My payments have been approximately {$89.00} w...\n", + " I 've sent multiple letters to this agency abo...\n", " \n", " \n", - " 614\n", - " [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-...\n", - " {\"token_count\":399,\"truncated\":false}\n", + " 795\n", + " [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-...\n", + " {\"token_count\":141,\"truncated\":false}\n", " \n", - " Hi, I have contacted Trans Union XXXX XXXX abo...\n", + " I receive social security XXXX funds in my XXX...\n", " \n", " \n", - " 1236\n", - " [-5.32836001e-03 -5.84292673e-02 -5.86670786e-...\n", - " {\"token_count\":129,\"truncated\":false}\n", + " 861\n", + " [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " I have a XXXX XXXX XXXX credit card on my Exp...\n", + " Hello, My name is XXXX XXXX XXXX. I have a pro...\n", " \n", " \n", - " 1477\n", - " [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-...\n", - " {\"token_count\":16,\"truncated\":false}\n", + " 1103\n", + " [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-...\n", + " {\"token_count\":31,\"truncated\":false}\n", " \n", - " Wrongs information, selling my information to ...\n", + " The debt occurred more than 7 years in the pas...\n", " \n", " \n", - " 2261\n", - " [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-...\n", - " {\"token_count\":33,\"truncated\":false}\n", + " 1241\n", + " [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " Please investigate and delete disputed item th...\n", + " UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA...\n", " \n", " \n", - " 2361\n", - " [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-...\n", - " {\"token_count\":45,\"truncated\":false}\n", + " 1729\n", + " [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-...\n", + " {\"token_count\":382,\"truncated\":false}\n", " \n", - " By the provisions of the Fair Credit Reporting...\n", + " XXXX on XXXX XX/XX/2021 I have Mr. 
Cooper mort...\n", " \n", " \n", - " 2378\n", - " [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-...\n", - " {\"token_count\":892,\"truncated\":false}\n", + " 2167\n", + " [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-...\n", + " {\"token_count\":556,\"truncated\":false}\n", " \n", - " Since XX/XX/XXXX I have been trying to dispute...\n", + " This is the third such complaint I have submit...\n", " \n", " \n", - " 3133\n", - " [ 0.00152804 -0.04189068 -0.04220504 -0.053740...\n", - " {\"token_count\":90,\"truncated\":false}\n", + " 2219\n", + " [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-...\n", + " {\"token_count\":196,\"truncated\":false}\n", " \n", - " Out of the blue I received a debt collection n...\n", + " Found and add online for a Prepaid Credit card...\n", " \n", " \n", - " 3140\n", - " [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-...\n", - " {\"token_count\":372,\"truncated\":false}\n", + " 2392\n", + " [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-...\n", + " {\"token_count\":641,\"truncated\":false}\n", " \n", - " My wife and I have been sending money to XXXX ...\n", + " I am furnishing this complaint against Fed Loa...\n", " \n", " \n", - " 3322\n", - " [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-...\n", - " {\"token_count\":36,\"truncated\":false}\n", + " 2528\n", + " [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-...\n", + " {\"token_count\":176,\"truncated\":false}\n", " \n", - " Phone calls from Convergent Outsourcing XXXX. ...\n", + " Despite multiple written requests, the unverif...\n", " \n", " \n", - " 3583\n", - " [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-...\n", - " {\"token_count\":52,\"truncated\":false}\n", + " 2737\n", + " [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-...\n", + " {\"token_count\":230,\"truncated\":false}\n", " \n", - " I recently received a copy of my credit report...\n", + " After unsatisfying communication in the messag...\n", " \n", " \n", - " 4134\n", - " [-7.04960374e-04 -3.52595337e-02 -1.65264793e-...\n", - " {\"token_count\":412,\"truncated\":false}\n", + " 2859\n", + " [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-...\n", + " {\"token_count\":238,\"truncated\":false}\n", " \n", - " I have been sending the creditor what they hav...\n", + " Good Morning. My name is XXXX XXXX. My account...\n", " \n", " \n", - " 4496\n", - " [ 3.67735326e-02 1.21120387e-03 -5.20942472e-...\n", - " {\"token_count\":182,\"truncated\":false}\n", + " 3439\n", + " [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-...\n", + " {\"token_count\":197,\"truncated\":false}\n", " \n", - " This is my second complaint. Their response to...\n", + " I have ongoing disputes that are preventing me...\n", " \n", " \n", - " 5260\n", - " [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-...\n", - " {\"token_count\":103,\"truncated\":false}\n", + " 3738\n", + " [ 0.01422119 -0.01114973 -0.04438976 -0.024421...\n", + " {\"token_count\":160,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte...\n", + " I had a loan with national Collegiate Trust. 
i...\n", " \n", " \n", - " 5400\n", - " [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-...\n", - " {\"token_count\":60,\"truncated\":false}\n", + " 3805\n", + " [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-...\n", + " {\"token_count\":477,\"truncated\":false}\n", " \n", - " Upon checking my XXXX credit report I noticed ...\n", + " Hi I am submitting this XXXX XXXX this isn't a...\n", " \n", " \n", - " 5425\n", - " [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 3915\n", + " [-7.23852217e-03 -4.69538383e-02 -5.60489520e-...\n", + " {\"token_count\":116,\"truncated\":false}\n", " \n", - " Follow up to previous complaint XXXX XXXX XXXX...\n", + " portfolio is showin on my credit report with a...\n", " \n", " \n", - " 6014\n", - " [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-...\n", - " {\"token_count\":175,\"truncated\":false}\n", + " 3917\n", + " [-8.92711710e-03 -4.49132621e-02 -4.29662578e-...\n", + " {\"token_count\":71,\"truncated\":false}\n", " \n", - " My new XXXX lease was over always paid on time...\n", + " the company shared my information with another...\n", " \n", " \n", - " 8192\n", - " [ 0.01937891 -0.05466933 -0.06070872 -0.059028...\n", - " {\"token_count\":131,\"truncated\":false}\n", + " 4281\n", + " [-1.69487391e-02 -1.89835522e-02 -3.80971469e-...\n", + " {\"token_count\":130,\"truncated\":false}\n", " \n", - " I have no idea where this account cane from. B...\n", + " I tried to submit a teacher loan forgiveness a...\n", " \n", " \n", - " 8240\n", - " [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-...\n", - " {\"token_count\":87,\"truncated\":false}\n", + " 4470\n", + " [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-...\n", + " {\"token_count\":200,\"truncated\":false}\n", " \n", - " I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F...\n", + " in accordance with the Fair Credit Reporting a...\n", " \n", " \n", - " 8720\n", - " [ 0.03133732 -0.03972461 -0.00178199 -0.035876...\n", - " {\"token_count\":645,\"truncated\":false}\n", + " 4915\n", + " [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-...\n", + " {\"token_count\":23,\"truncated\":false}\n", " \n", - " XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum...\n", + " XXXX XXXX did not give me a receipt or a copy ...\n", " \n", " \n", - " 8914\n", - " [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-...\n", - " {\"token_count\":180,\"truncated\":false}\n", + " 4928\n", + " [-4.43694415e-03 -3.66776163e-04 -9.08496231e-...\n", + " {\"token_count\":83,\"truncated\":false}\n", " \n", - " On XX/XX/21 I sent a letter regarding inaccura...\n", + " This company has filed a civil suit during a g...\n", " \n", " \n", - " 10021\n", - " [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-...\n", - " {\"token_count\":30,\"truncated\":false}\n", + " 5338\n", + " [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-...\n", + " {\"token_count\":1279,\"truncated\":false}\n", " \n", - " XX/XX/XXXX and XX/XX/XXXX inaccurate informati...\n", + " My credit report contains errors that is keepi...\n", " \n", " \n", - " 10327\n", - " [-0.00979626 -0.04912931 -0.08654705 -0.021063...\n", - " {\"token_count\":194,\"truncated\":false}\n", + " 5582\n", + " [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-...\n", + " {\"token_count\":396,\"truncated\":false}\n", " \n", - " When I reviewed my credit report, I discovered...\n", + " Coast Professional, XXXX, LA contacted me by m...\n", " \n", " \n", - " 10345\n", - " [-0.04292191 -0.02636929 -0.06177032 -0.076520...\n", - " {\"token_count\":262,\"truncated\":false}\n", + 
" 6386\n", + " [ 3.33276950e-02 1.53224478e-02 -1.89354066e-...\n", + " {\"token_count\":79,\"truncated\":false}\n", " \n", - " U.S. Bank sent two letters containing Visa Deb...\n", + " Cares act refund requested in XXXX, called mul...\n", " \n", " \n", - " 10369\n", - " [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-...\n", - " {\"token_count\":77,\"truncated\":false}\n", + " 6956\n", + " [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-...\n", + " {\"token_count\":194,\"truncated\":false}\n", " \n", - " I requested from XXXX that they reverse the la...\n", + " n accordance with the Fair Credit Reporting ac...\n", " \n", " \n", "\n", @@ -1064,86 +1064,86 @@ "[10000 rows x 4 columns in total]" ], "text/plain": [ - " text_embedding \\\n", - "545 [ 1.82510037e-02 -1.27867460e-02 -1.57095697e-... \n", - "614 [ 5.40032536e-02 -5.28502129e-02 -5.33268750e-... \n", - "1236 [-5.32836001e-03 -5.84292673e-02 -5.86670786e-... \n", - "1477 [ 3.02605387e-02 -4.37121317e-02 -2.70802993e-... \n", - "2261 [ 2.35723313e-02 -3.73509154e-02 -6.44604117e-... \n", - "2361 [ 1.04440488e-02 -9.37070698e-03 -7.36323372e-... \n", - "2378 [ 3.04989032e-02 -4.08191867e-02 -6.18648790e-... \n", - "3133 [ 0.00152804 -0.04189068 -0.04220504 -0.053740... \n", - "3140 [ 3.11435573e-02 -4.44000624e-02 -2.10917685e-... \n", - "3322 [ 2.75927987e-02 -6.23729872e-03 -3.83295454e-... \n", - "3583 [ 9.20385588e-03 -3.83387171e-02 -6.46291822e-... \n", - "4134 [-7.04960374e-04 -3.52595337e-02 -1.65264793e-... \n", - "4496 [ 3.67735326e-02 1.21120387e-03 -5.20942472e-... \n", - "5260 [ 2.07133405e-02 -1.69602726e-02 -5.07124476e-... \n", - "5400 [ 1.44114876e-02 -2.34710164e-02 -6.58538565e-... \n", - "5425 [ 3.10326386e-02 -2.19427086e-02 -6.56386837e-... \n", - "6014 [ 1.90773793e-02 -2.27493346e-02 -3.27166244e-... \n", - "8192 [ 0.01937891 -0.05466933 -0.06070872 -0.059028... \n", - "8240 [ 4.34123818e-03 -3.40953320e-02 -4.06381376e-... \n", - "8720 [ 0.03133732 -0.03972461 -0.00178199 -0.035876... \n", - "8914 [ 1.75969116e-02 -2.25022305e-02 -5.70390299e-... \n", - "10021 [ 5.02460636e-02 -5.25112189e-02 -4.12914790e-... \n", - "10327 [-0.00979626 -0.04912931 -0.08654705 -0.021063... \n", - "10345 [-0.04292191 -0.02636929 -0.06177032 -0.076520... \n", - "10369 [ 2.16020197e-02 -5.62509745e-02 -5.93873672e-... \n", + " text_embedding \\\n", + "782 [ 2.78223325e-02 -1.71949025e-02 -5.01214415e-... \n", + "795 [ 1.39164589e-02 -5.08313216e-02 -4.53360938e-... \n", + "861 [ 7.80681521e-03 -3.23560014e-02 -6.76454604e-... \n", + "1103 [ 4.18044440e-02 -4.28444222e-02 -6.26875088e-... \n", + "1241 [ 7.74183637e-03 -6.50701150e-02 -2.13856809e-... \n", + "1729 [ 3.15782428e-02 -1.23979878e-02 -3.70296165e-... \n", + "2167 [ 9.87241510e-03 -1.81103535e-02 -4.17162031e-... \n", + "2219 [ 2.56749280e-02 -4.92163002e-02 -5.67202382e-... \n", + "2392 [ 2.34611966e-02 -4.74611111e-02 -3.59710641e-... \n", + "2528 [ 1.90760177e-02 -4.90266569e-02 -5.60806654e-... \n", + "2737 [ 1.81887485e-02 -8.74284655e-03 -2.73009986e-... \n", + "2859 [ 3.52482982e-02 -3.30757573e-02 -4.48422395e-... \n", + "3439 [ 3.40348878e-03 -2.72301212e-02 -2.03482248e-... \n", + "3738 [ 0.01422119 -0.01114973 -0.04438976 -0.024421... \n", + "3805 [ 1.08179580e-02 -3.44337188e-02 -5.08812033e-... \n", + "3915 [-7.23852217e-03 -4.69538383e-02 -5.60489520e-... \n", + "3917 [-8.92711710e-03 -4.49132621e-02 -4.29662578e-... \n", + "4281 [-1.69487391e-02 -1.89835522e-02 -3.80971469e-... \n", + "4470 [ 1.28689921e-02 -3.25881056e-02 -6.53645024e-... 
\n", + "4915 [ 5.19403480e-02 -7.32436478e-02 -4.60561663e-... \n", + "4928 [-4.43694415e-03 -3.66776163e-04 -9.08496231e-... \n", + "5338 [ 2.19908613e-03 -3.93951498e-02 -6.52823672e-... \n", + "5582 [ 2.86326781e-02 -4.89189997e-02 -8.68150592e-... \n", + "6386 [ 3.33276950e-02 1.53224478e-02 -1.89354066e-... \n", + "6956 [ 1.47060463e-02 -3.36431377e-02 -6.56675845e-... \n", "\n", " statistics ml_embed_text_status \\\n", - "545 {\"token_count\":178,\"truncated\":false} \n", - "614 {\"token_count\":399,\"truncated\":false} \n", - "1236 {\"token_count\":129,\"truncated\":false} \n", - "1477 {\"token_count\":16,\"truncated\":false} \n", - "2261 {\"token_count\":33,\"truncated\":false} \n", - "2361 {\"token_count\":45,\"truncated\":false} \n", - "2378 {\"token_count\":892,\"truncated\":false} \n", - "3133 {\"token_count\":90,\"truncated\":false} \n", - "3140 {\"token_count\":372,\"truncated\":false} \n", - "3322 {\"token_count\":36,\"truncated\":false} \n", - "3583 {\"token_count\":52,\"truncated\":false} \n", - "4134 {\"token_count\":412,\"truncated\":false} \n", - "4496 {\"token_count\":182,\"truncated\":false} \n", - "5260 {\"token_count\":103,\"truncated\":false} \n", - "5400 {\"token_count\":60,\"truncated\":false} \n", - "5425 {\"token_count\":87,\"truncated\":false} \n", - "6014 {\"token_count\":175,\"truncated\":false} \n", - "8192 {\"token_count\":131,\"truncated\":false} \n", - "8240 {\"token_count\":87,\"truncated\":false} \n", - "8720 {\"token_count\":645,\"truncated\":false} \n", - "8914 {\"token_count\":180,\"truncated\":false} \n", - "10021 {\"token_count\":30,\"truncated\":false} \n", - "10327 {\"token_count\":194,\"truncated\":false} \n", - "10345 {\"token_count\":262,\"truncated\":false} \n", - "10369 {\"token_count\":77,\"truncated\":false} \n", + "782 {\"token_count\":121,\"truncated\":false} \n", + "795 {\"token_count\":141,\"truncated\":false} \n", + "861 {\"token_count\":160,\"truncated\":false} \n", + "1103 {\"token_count\":31,\"truncated\":false} \n", + "1241 {\"token_count\":23,\"truncated\":false} \n", + "1729 {\"token_count\":382,\"truncated\":false} \n", + "2167 {\"token_count\":556,\"truncated\":false} \n", + "2219 {\"token_count\":196,\"truncated\":false} \n", + "2392 {\"token_count\":641,\"truncated\":false} \n", + "2528 {\"token_count\":176,\"truncated\":false} \n", + "2737 {\"token_count\":230,\"truncated\":false} \n", + "2859 {\"token_count\":238,\"truncated\":false} \n", + "3439 {\"token_count\":197,\"truncated\":false} \n", + "3738 {\"token_count\":160,\"truncated\":false} \n", + "3805 {\"token_count\":477,\"truncated\":false} \n", + "3915 {\"token_count\":116,\"truncated\":false} \n", + "3917 {\"token_count\":71,\"truncated\":false} \n", + "4281 {\"token_count\":130,\"truncated\":false} \n", + "4470 {\"token_count\":200,\"truncated\":false} \n", + "4915 {\"token_count\":23,\"truncated\":false} \n", + "4928 {\"token_count\":83,\"truncated\":false} \n", + "5338 {\"token_count\":1279,\"truncated\":false} \n", + "5582 {\"token_count\":396,\"truncated\":false} \n", + "6386 {\"token_count\":79,\"truncated\":false} \n", + "6956 {\"token_count\":194,\"truncated\":false} \n", "\n", - " content \n", - "545 My payments have been approximately {$89.00} w... \n", - "614 Hi, I have contacted Trans Union XXXX XXXX abo... \n", - "1236 I have a XXXX XXXX XXXX credit card on my Exp... \n", - "1477 Wrongs information, selling my information to ... \n", - "2261 Please investigate and delete disputed item th... \n", - "2361 By the provisions of the Fair Credit Reporting... 
\n", - "2378 Since XX/XX/XXXX I have been trying to dispute... \n", - "3133 Out of the blue I received a debt collection n... \n", - "3140 My wife and I have been sending money to XXXX ... \n", - "3322 Phone calls from Convergent Outsourcing XXXX. ... \n", - "3583 I recently received a copy of my credit report... \n", - "4134 I have been sending the creditor what they hav... \n", - "4496 This is my second complaint. Their response to... \n", - "5260 XX/XX/XXXX and XX/XX/XXXX, {$3200.00} contacte... \n", - "5400 Upon checking my XXXX credit report I noticed ... \n", - "5425 Follow up to previous complaint XXXX XXXX XXXX... \n", - "6014 My new XXXX lease was over always paid on time... \n", - "8192 I have no idea where this account cane from. B... \n", - "8240 I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL F... \n", - "8720 XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consum... \n", - "8914 On XX/XX/21 I sent a letter regarding inaccura... \n", - "10021 XX/XX/XXXX and XX/XX/XXXX inaccurate informati... \n", - "10327 When I reviewed my credit report, I discovered... \n", - "10345 U.S. Bank sent two letters containing Visa Deb... \n", - "10369 I requested from XXXX that they reverse the la... \n", + " content \n", + "782 I 've sent multiple letters to this agency abo... \n", + "795 I receive social security XXXX funds in my XXX... \n", + "861 Hello, My name is XXXX XXXX XXXX. I have a pro... \n", + "1103 The debt occurred more than 7 years in the pas... \n", + "1241 UNAUTHORIZED CREDIT REPORTING NO ACCOUNT TO VA... \n", + "1729 XXXX on XXXX XX/XX/2021 I have Mr. Cooper mort... \n", + "2167 This is the third such complaint I have submit... \n", + "2219 Found and add online for a Prepaid Credit card... \n", + "2392 I am furnishing this complaint against Fed Loa... \n", + "2528 Despite multiple written requests, the unverif... \n", + "2737 After unsatisfying communication in the messag... \n", + "2859 Good Morning. My name is XXXX XXXX. My account... \n", + "3439 I have ongoing disputes that are preventing me... \n", + "3738 I had a loan with national Collegiate Trust. i... \n", + "3805 Hi I am submitting this XXXX XXXX this isn't a... \n", + "3915 portfolio is showin on my credit report with a... \n", + "3917 the company shared my information with another... \n", + "4281 I tried to submit a teacher loan forgiveness a... \n", + "4470 in accordance with the Fair Credit Reporting a... \n", + "4915 XXXX XXXX did not give me a receipt or a copy ... \n", + "4928 This company has filed a civil suit during a g... \n", + "5338 My credit report contains errors that is keepi... \n", + "5582 Coast Professional, XXXX, LA contacted me by m... \n", + "6386 Cares act refund requested in XXXX, called mul... \n", + "6956 n accordance with the Fair Credit Reporting ac... \n", "...\n", "\n", "[10000 rows x 4 columns]" @@ -1214,7 +1214,7 @@ { "data": { "text/html": [ - "Query job fa4bbc13-3831-4c80-9b59-9939e605ed58 is DONE. 61.7 MB processed. Open Job" + "Query job 46da96c8-c454-44d3-8b98-0e1bfeca69dd is DONE. 61.7 MB processed. Open Job" ], "text/plain": [ "" @@ -1226,7 +1226,7 @@ { "data": { "text/html": [ - "Query job d2d681aa-e49a-4fda-89fd-60cf906d3aec is DONE. 0 Bytes processed. Open Job" + "Query job dc6fe7cf-329d-4274-aff9-0b8dc2e56230 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1238,7 +1238,7 @@ { "data": { "text/html": [ - "Query job 234bb6be-625c-4c96-baea-c37c33410114 is DONE. 72.7 MB processed. Open Job" + "Query job 8c25a14a-af39-40a9-add5-de0f14bce9ce is DONE. 72.4 MB processed. 
Open Job" ], "text/plain": [ "" @@ -1250,7 +1250,7 @@ { "data": { "text/html": [ - "Query job 285817cb-99d3-426f-82c3-89d36119e8db is DONE. 80.0 kB processed. Open Job" + "Query job 0a6a45b2-7c35-4be8-91a3-391a5381553e is DONE. 80.0 kB processed. Open Job" ], "text/plain": [ "" @@ -1262,7 +1262,7 @@ { "data": { "text/html": [ - "Query job 3a39d2b0-55a1-4922-972a-8806b387f877 is DONE. 73.3 MB processed. Open Job" + "Query job b5e00edd-de21-40c1-bf61-9f1affdea318 is DONE. 73.1 MB processed. Open Job" ], "text/plain": [ "" @@ -1302,49 +1302,49 @@ " \n", " \n", " \n", - " 182250\n", + " 1094645\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.570560301900...\n", - " [ 4.70298417e-02 -4.08669300e-02 -2.99868709e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572...\n", + " [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-...\n", " {\"token_count\":10,\"truncated\":false}\n", " \n", - " These are not my accounts. Please remove them.\n", + " I do not have an account with this creditor\n", " \n", " \n", - " 3023485\n", + " 3372485\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.523572693768...\n", - " [ 1.55437263e-02 -1.93240177e-02 -2.48466972e-...\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310...\n", + " [-0.00161087 -0.04956109 -0.07371692 -0.057822...\n", " {\"token_count\":10,\"truncated\":false}\n", " \n", - " This debt is not mine due to identity theft.\n", + " Hard inquiries in my report that I do not reco...\n", " \n", " \n", - " 407254\n", + " 2669308\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.515173566816...\n", - " [-0.01293471 -0.01959546 -0.02238463 -0.066214...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244...\n", + " [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " I do not owe this company money!!!!!\n", + " I purchase {$25.00} for stock on the cash app ...\n", " \n", " \n", - " 1509454\n", + " 133816\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.645342721754...\n", - " [ 3.21860723e-02 -2.67103072e-02 -4.78175096e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124...\n", + " [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-...\n", + " {\"token_count\":100,\"truncated\":false}\n", " \n", - " VIOLATES HIPPA AND CRA\n", + " BBVA fees I am in The Texas snow storm where I...\n", " \n", " \n", - " 2357848\n", + " 2697156\n", " 1\n", - " [{'CENTROID_ID': 1, 'DISTANCE': 0.519872186251...\n", - " [-1.88122243e-02 -2.68064123e-02 -4.69480827e-...\n", - " {\"token_count\":10,\"truncated\":false}\n", + " [{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102...\n", + " [-1.28429877e-02 -1.85956229e-02 -3.93197313e-...\n", + " {\"token_count\":1011,\"truncated\":false}\n", " \n", - " Receive numerous phone calls. I have no debt.\n", + " After paying on my student loan for years, I o...\n", " \n", " \n", "\n", @@ -1352,32 +1352,32 @@ ], "text/plain": [ " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "182250 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.570560301900... \n", - "3023485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.523572693768... \n", - "407254 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.515173566816... \n", - "1509454 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.645342721754... \n", - "2357848 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.519872186251... \n", + "1094645 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.530282685572... \n", + "3372485 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.643931578310... 
\n", + "2669308 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.599709344244... \n", + "133816 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.618444281124... \n", + "2697156 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.500398902102... \n", "\n", " text_embedding \\\n", - "182250 [ 4.70298417e-02 -4.08669300e-02 -2.99868709e-... \n", - "3023485 [ 1.55437263e-02 -1.93240177e-02 -2.48466972e-... \n", - "407254 [-0.01293471 -0.01959546 -0.02238463 -0.066214... \n", - "1509454 [ 3.21860723e-02 -2.67103072e-02 -4.78175096e-... \n", - "2357848 [-1.88122243e-02 -2.68064123e-02 -4.69480827e-... \n", + "1094645 [ 7.32792774e-03 -7.59598315e-02 -4.49591577e-... \n", + "3372485 [-0.00161087 -0.04956109 -0.07371692 -0.057822... \n", + "2669308 [ 5.50241247e-02 -1.50039541e-02 -2.08624080e-... \n", + "133816 [ 1.77251529e-02 -3.89547497e-02 -3.82236368e-... \n", + "2697156 [-1.28429877e-02 -1.85956229e-02 -3.93197313e-... \n", "\n", - " statistics ml_embed_text_status \\\n", - "182250 {\"token_count\":10,\"truncated\":false} \n", - "3023485 {\"token_count\":10,\"truncated\":false} \n", - "407254 {\"token_count\":10,\"truncated\":false} \n", - "1509454 {\"token_count\":10,\"truncated\":false} \n", - "2357848 {\"token_count\":10,\"truncated\":false} \n", + " statistics ml_embed_text_status \\\n", + "1094645 {\"token_count\":10,\"truncated\":false} \n", + "3372485 {\"token_count\":10,\"truncated\":false} \n", + "2669308 {\"token_count\":100,\"truncated\":false} \n", + "133816 {\"token_count\":100,\"truncated\":false} \n", + "2697156 {\"token_count\":1011,\"truncated\":false} \n", "\n", - " content \n", - "182250 These are not my accounts. Please remove them. \n", - "3023485 This debt is not mine due to identity theft. \n", - "407254 I do not owe this company money!!!!! \n", - "1509454 VIOLATES HIPPA AND CRA \n", - "2357848 Receive numerous phone calls. I have no debt. " + " content \n", + "1094645 I do not have an account with this creditor \n", + "3372485 Hard inquiries in my report that I do not reco... \n", + "2669308 I purchase {$25.00} for stock on the cash app ... \n", + "133816 BBVA fees I am in The Texas snow storm where I... \n", + "2697156 After paying on my student loan for years, I o... " ] }, "execution_count": 13, @@ -1430,7 +1430,7 @@ { "data": { "text/html": [ - "Query job 85ead687-4ba9-44bf-88da-23a066f45960 is DONE. 10.7 MB processed. Open Job" + "Query job 8d4f24d6-dc37-47d3-8b4d-4505a55c4ccc is DONE. 10.4 MB processed. Open Job" ], "text/plain": [ "" @@ -1442,7 +1442,7 @@ { "data": { "text/html": [ - "Query job 68ef20cd-220d-40a9-bb42-63ed3d6f5d3f is DONE. 10.7 MB processed. Open Job" + "Query job c1f979ee-1f5d-4f37-8595-ee2167c06e63 is DONE. 10.4 MB processed. Open Job" ], "text/plain": [ "" @@ -1478,42 +1478,32 @@ "output_type": "stream", "text": [ "comment list 1:\n", - "1. Wrongs information, selling my information to third party. Incorrect reporting\n", - "2. I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL FROM XXXX XXXX XXXX XXXX WHICH ENDED A A LIE. THEY ALSO, PULLED MANY OTHERS I DID NT AGREED TO. SOLF PULLS ONLY\n", - "3. XX/XX/XXXX and XX/XX/XXXX inaccurate information reported 30 days late.\n", - "4. Im working on myCredit and I see a couple of inquiries that I have no idea where they came from.\n", - "5. I request a copy of all dispute results and documentary evidence from XXXX, and XXXX mailed to me\n", - "\n", - "comment list 2:\n", - "1. 
My wife and I have been sending money to XXXX via my brother-in-law to finish a building project we have been working on since XXXX with target date of completion by XX/XX/XXXX. In XXXX XXXX my brother-in-law in was contacted by his bank to confirm he was not defrauding my wife. My brother-in-law confirmed he was helping to handle the building project by organizing and paying the workers. In XXXX XXXX Bank of America reach out to my wife to update her profile to avoid account restrictions. My wife 's account was eventually restricted until she called and confirmed her employment and other personal information. My wife 's full account activities were then restored and we continued sending wire transfers to XXXX via her checking account. \n", - "Then I received a letter dated XXXX XXXX XXXX from Bank of America stating the money market account I share with my wife which has been opened since XXXX will be will be restricted from use in 21 days and closed in 30 days with no reason. I strongly believe this is a result of the legal international wires because there was no reason to close the Savings account which had with hardly any activity. \n", - "I agree that Bank of America has a right to close accounts but I do not agree with Bank of America closing accounts because of international transactions unless they can prove fraud, criminal activity or support for terrorism, this is discriminatory towards foreign nationals. How are foreign nationals suppose to make investments or support their family/community if they are excluded from the banking system?\n", - "2. XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consumer Financial Protection Bureau XXXX XXXX XXXX XXXX, IA XXXX Dear Sir or Madam : In XX/XX/XXXX Out of the blue JP Morgan Chase arbitrarily closed my account. This was after my mother is a XXXX survivor who is over XXXX years old and for whom I have a general power of attorney and take care of her bill paying was questioned about a transaction. She is also XXXX XXXX. \n", - "\n", - "I have reason to belief that a mentally disturbed family member for whom I have an order of protection initiated this situation. This individual has ben harassing me and other members of my family for a considerable amount of time. \n", - "\n", - "The bank initially was satisfied with her response. However within 2 days they closed the account of a XXXX year old XXXX XXXX person. \n", - "\n", - "Soon after for no reason my account was closed as well. I tried to reach out to the corporate offices of Chase and make great effort to find out what happened and to restore my account as well as my mothers but I was unsuccessful. In addition the people I spoke to were not only unhelpful bu exceedingly rude. \n", - "\n", - "I should add that I have had an account with Chase since XX/XX/XXXX and took care of my ailing father before he passed away for over 25 years as well. I am now taking care of my mother for over 28 years. \n", + "1. This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", "\n", - "I went so far as contacting a prominent Television reporter who was interested in doing a report on what happened. \n", + "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. 
\n", + "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. \n", + "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. \n", + "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", + "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", + "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", + "I was told it would take 5-7 business days to be resolved.\n", + "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", + "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", + "4. My government feeds are not coming on to my card and I need the problem fix today\n", + "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. I am not receiving statements, which proves it difficult to determine the due dates on the accounts. The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", "\n", - "I have since managed to open an account at another bank but this week I had reason to go to a branch of Chase regarding another issue and a manager using my That is a very serious unsubstantiated accusation and given this information I have no choice but to submit this complaint. \n", - "\n", - "I have no interest in having an account again at a disreputable bank like Chase but I can not and will not accept or tolerate a derogatory accusation be associated with my name. 
\n", - "\n", - "I hope that my complaint will hAve the desired effect of removing this derogatory unsubstantiated accusation be removed from my name. However. I will not let this unfair matter stand and Chase ought to know that I have already retained an attorney and will if necessary hold Chase responsible and liable all damage i have incurred now And in the future Enclosed, please find the letter from Chase stating that they were closing my mothers account and a similar letter was received by me too. \n", - "\n", - "Also please find a letter from her Doctor stating that she is XXXX XXXX. \n", - "\n", - "Thank you. \n", + "comment list 2:\n", + "1. XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", + "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", + "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", + "Why the online amortization schedule is not visible now? \n", "\n", - "XXXX XXXX\n", - "3. U.S. Bank sent two letters containing Visa Debit Cards to our address on XX/XX/2021. One Visa Debit Card is in the name of XXXX XXXX and one Visa Debit Card is in the name of XXXX XXXX. These cards supposedly link to existing checking accounts at U.S. Bank. However : ( 1 ) Neither of us have existing checking accounts at U.S. Bank, ( 2 ) Neither of us solicited a bank account at U.S. Bank, and ( 3 ) Neither of us solicited a Visa Debit Card. We have attempted to call U.S. Bank at the phone numbers provided in the letters but are only able to access an automated system which will not proceed without us establishing accounts and activating these cards. We are concerned here that one of two things has happened : either ( 1 ) we are victims of identity theft and some third party is trying to establish accounts in our name, or ( 2 ) U.S. Bank is engaged in bank fraud. In either case, we request the assistance of the Consumer Financial Protection Bureau. Thank you.\n", - "4. I contacted my bank over 3 times about this amount, the first two times I spoke to gentleman that agreed with me that I didnt get back a certain amount of dollars back, I did the math and they refuse to see that I do not owe this amount because I never had it in the first place. I wrote out all my charges and connected it to the charges made back from the consumer and I was missing XXXX, I called XXXX they said they gave it all back which is not their fault because they showed me proof. Along the lines Capital One does not want to take responsibility for the missing money. I have wrote everything out and then its not adding up, they keep saying that they did a charge back twice which is incorrect. 
My balance was at XXXX before I made this purchase and it shouldve been returned back to XXXX because I return all the items and nothing is in my possession. I have proof that I returned everything.\n", - "5. CB INDIGO ( Bank ) XX/XX/2022 I just recently got off the phone with the company and they wont put in a request of removal of a fraudulent hard inquiry from Insigo Mastercard to XXXX. They dont even have my information on file, I called 3 times most of them are lazy and was giving me a hard time.\n", + "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", + "Highly inefficient organization.\n", + "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", + "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", + "4. In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. 
After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", + "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", "\n" ] } @@ -1547,41 +1537,31 @@ "text": [ "Please highlight the most obvious difference between the two lists of comments:\n", "comment list 1:\n", - "1. Wrongs information, selling my information to third party. Incorrect reporting\n", - "2. I TIED TO BUY CAR AT XXXX, THEY GOT APPROVAL FROM XXXX XXXX XXXX XXXX WHICH ENDED A A LIE. THEY ALSO, PULLED MANY OTHERS I DID NT AGREED TO. SOLF PULLS ONLY\n", - "3. XX/XX/XXXX and XX/XX/XXXX inaccurate information reported 30 days late.\n", - "4. Im working on myCredit and I see a couple of inquiries that I have no idea where they came from.\n", - "5. I request a copy of all dispute results and documentary evidence from XXXX, and XXXX mailed to me\n", - "comment list 2:\n", - "1. My wife and I have been sending money to XXXX via my brother-in-law to finish a building project we have been working on since XXXX with target date of completion by XX/XX/XXXX. In XXXX XXXX my brother-in-law in was contacted by his bank to confirm he was not defrauding my wife. My brother-in-law confirmed he was helping to handle the building project by organizing and paying the workers. In XXXX XXXX Bank of America reach out to my wife to update her profile to avoid account restrictions. 
My wife 's account was eventually restricted until she called and confirmed her employment and other personal information. My wife 's full account activities were then restored and we continued sending wire transfers to XXXX via her checking account. \n", - "Then I received a letter dated XXXX XXXX XXXX from Bank of America stating the money market account I share with my wife which has been opened since XXXX will be will be restricted from use in 21 days and closed in 30 days with no reason. I strongly believe this is a result of the legal international wires because there was no reason to close the Savings account which had with hardly any activity. \n", - "I agree that Bank of America has a right to close accounts but I do not agree with Bank of America closing accounts because of international transactions unless they can prove fraud, criminal activity or support for terrorism, this is discriminatory towards foreign nationals. How are foreign nationals suppose to make investments or support their family/community if they are excluded from the banking system?\n", - "2. XXXX XXXX XXXX XXXX, NY XXXX XX/XX/XXXX Consumer Financial Protection Bureau XXXX XXXX XXXX XXXX, IA XXXX Dear Sir or Madam : In XX/XX/XXXX Out of the blue JP Morgan Chase arbitrarily closed my account. This was after my mother is a XXXX survivor who is over XXXX years old and for whom I have a general power of attorney and take care of her bill paying was questioned about a transaction. She is also XXXX XXXX. \n", - "\n", - "I have reason to belief that a mentally disturbed family member for whom I have an order of protection initiated this situation. This individual has ben harassing me and other members of my family for a considerable amount of time. \n", - "\n", - "The bank initially was satisfied with her response. However within 2 days they closed the account of a XXXX year old XXXX XXXX person. \n", + "1. This is the third such complaint I have submitted regarding the same type of issue over the past 12-18 months. \n", "\n", - "Soon after for no reason my account was closed as well. I tried to reach out to the corporate offices of Chase and make great effort to find out what happened and to restore my account as well as my mothers but I was unsuccessful. In addition the people I spoke to were not only unhelpful bu exceedingly rude. \n", - "\n", - "I should add that I have had an account with Chase since XX/XX/XXXX and took care of my ailing father before he passed away for over 25 years as well. I am now taking care of my mother for over 28 years. \n", - "\n", - "I went so far as contacting a prominent Television reporter who was interested in doing a report on what happened. \n", - "\n", - "I have since managed to open an account at another bank but this week I had reason to go to a branch of Chase regarding another issue and a manager using my That is a very serious unsubstantiated accusation and given this information I have no choice but to submit this complaint. \n", - "\n", - "I have no interest in having an account again at a disreputable bank like Chase but I can not and will not accept or tolerate a derogatory accusation be associated with my name. \n", - "\n", - "I hope that my complaint will hAve the desired effect of removing this derogatory unsubstantiated accusation be removed from my name. However. 
I will not let this unfair matter stand and Chase ought to know that I have already retained an attorney and will if necessary hold Chase responsible and liable all damage i have incurred now And in the future Enclosed, please find the letter from Chase stating that they were closing my mothers account and a similar letter was received by me too. \n", - "\n", - "Also please find a letter from her Doctor stating that she is XXXX XXXX. \n", - "\n", - "Thank you. \n", + "On XX/XX/XXXX, my co-signed account was flagged by Navient as past due. The XXXX payment was mailed priority on XX/XX/XXXX and received by Navient on XX/XX/XXXX and delivered to \" an individual '' per the post office. \n", + "I called Navient on XX/XX/XXXX to talk to them about why my account was flagged since they received the payment long before the due date. The payment is sent via XXXX money orders under the same cover. The XXXX money order ( {$160.00} ) was cashed on XX/XX/XXXX per XXXX XXXX, the second money order ( {$250.00} ) which was sent in the same priority envelope and received the same time has not been cashed. \n", + "When I called the customer service agent at Navient she told me that my account was past due and wanted me to send another payment. When I explained that they had received the payment she argued with me that if they received it, the payment would have been cashed. I asked to speak with a supervisor. \n", + "I was connected with supervisor, XXXX XXXX, who asked that I send copies of the payments to him so he could submit for a missing payment request. I faxed the proof on XXXX @ XXXX with a receipt acknowledgment. \n", + "On XX/XX/XXXX, the payment was still not applied to the account. When I called XXXX XXXX, the money order was still not cashed. I called Navient again. Because of an argumentative customer service rep again, I requested to speak with a supervisor. I spoke with XXXX XXXX. She states that payment was not received. I explained the situation again. She said the missing payment request had not been submitted. She had me upload the documents so she could request a missing payment search. I have done everything I have been asked. \n", + "This issues continues to occur. For approximately 6 months at a time, Navient gets this right then there are issues again and again. I have submitted CFPB complaints about this in the past. \n", + "I was told it would take 5-7 business days to be resolved.\n", + "2. I tried to submit a teacher loan forgiveness application and they lost my application. I submitted the application again after talking to XXXX people at Nelnet. Then when I called back to check on the status they told me that I needed to submit another form for a different school that I worked at. I had already called previously and asked if I needed to submit any other papers and they told me \" no ''. Therefore, I have been paying my loan for 5 months and it should be forgiven. I am still paying my loan because I have to wait for them to approve the new forgiveness paperwork.\n", + "3. PayPal continues to overcharge their currency rate. It it always inflated in their favor so that they collect the difference.\n", + "4. My government feeds are not coming on to my card and I need the problem fix today\n", + "5. Paypal Credit 's website is confusing and does not accurately reflect all activity. When speaking with representatives like XXXX, it 's confusing to them and they can barely follow along with it. 
I am not receiving statements, which proves it difficult to determine the due dates on the accounts. The Reps are n't knowledgeable and the only thing they repeat to you is the amount due on the screen.\n", + "comment list 2:\n", + "1. XXXX on XXXX XX/XX/2021 I have Mr. Cooper mortgage for years now. On XXXX XXXX XXXX I made an additional payment of $ XXXX towards my principal. More than 4 days - it's not reflected in the Amortization schedule ( Amortization schedule is not even visible ). Even after so many additional principal payments, Payoff calculator is way off and it still shows XXXX maturity date while it should start showing something like XX/XX/XXXX/XX/XX/XXXX as the initial date. There are lots of discrepancies on their website to reflect the balance and total. When called customer service on Friday and also chatted - i was assured of fixing this - but no fix till this point of time. Customer service there is a long wait. Auto bot doesn't let customers talk to the real person. \n", + "Finally after a lots of follow-up I got the amortization schedule via email but it is not reflecting another additional principal payment of {$4700.00} made on XX/XX/2021. \n", + "I did numerous chats and phone calls. Why i should depend on inefficinent humans to see my revised amortization schedule? \n", + "Why the online amortization schedule is not visible now? \n", "\n", - "XXXX XXXX\n", - "3. U.S. Bank sent two letters containing Visa Debit Cards to our address on XX/XX/2021. One Visa Debit Card is in the name of XXXX XXXX and one Visa Debit Card is in the name of XXXX XXXX. These cards supposedly link to existing checking accounts at U.S. Bank. However : ( 1 ) Neither of us have existing checking accounts at U.S. Bank, ( 2 ) Neither of us solicited a bank account at U.S. Bank, and ( 3 ) Neither of us solicited a Visa Debit Card. We have attempted to call U.S. Bank at the phone numbers provided in the letters but are only able to access an automated system which will not proceed without us establishing accounts and activating these cards. We are concerned here that one of two things has happened : either ( 1 ) we are victims of identity theft and some third party is trying to establish accounts in our name, or ( 2 ) U.S. Bank is engaged in bank fraud. In either case, we request the assistance of the Consumer Financial Protection Bureau. Thank you.\n", - "4. I contacted my bank over 3 times about this amount, the first two times I spoke to gentleman that agreed with me that I didnt get back a certain amount of dollars back, I did the math and they refuse to see that I do not owe this amount because I never had it in the first place. I wrote out all my charges and connected it to the charges made back from the consumer and I was missing XXXX, I called XXXX they said they gave it all back which is not their fault because they showed me proof. Along the lines Capital One does not want to take responsibility for the missing money. I have wrote everything out and then its not adding up, they keep saying that they did a charge back twice which is incorrect. My balance was at XXXX before I made this purchase and it shouldve been returned back to XXXX because I return all the items and nothing is in my possession. I have proof that I returned everything.\n", - "5. CB INDIGO ( Bank ) XX/XX/2022 I just recently got off the phone with the company and they wont put in a request of removal of a fraudulent hard inquiry from Insigo Mastercard to XXXX. 
They dont even have my information on file, I called 3 times most of them are lazy and was giving me a hard time.\n", + "Worst thing, after turning on escrow account, there is no transparency. Amount of escrow account is not in sync with all the sections of my online account. It seems that there are too many internal buggy systems in Mr. Cooper and everybody from different IT department is showing a different $ $ figure at various places. \n", + "Highly inefficient organization.\n", + "2. I had a loan with national Collegiate Trust. i did n't make any payments on the loan for 6 years and due to the \" statute to limitations '' the loan collector should not be contacting me any more, by they still are in both forms phone call 's and letters. I am also trying to buy a house for my family and found out that i can not get a USDA loan for the house because National Collegiate Trust has filled the loan as a government delinquent loan. The problem with that is the loan is a private loan not a federal/Government loan. due to the way they filled the lion on my credit report i am not able to buy my first home.\n", + "3. Cares act refund requested in XXXX, called multiple times given conflicting information every time. Its now XXXX and looking like its going to be even longer potentially months before the refund arrives. Blatant disregard for the cares act signed into law and terrible customer service. This company is ill suited to service the federal government contract it enjoys.\n", + "4. In an attempt to pay off my loan with Ocwen, I sent XXXX large payments to the institution on XXXX XXXX that would have decreased my loan amount to within a couple of hundred dollars ( you can not send XXXX payment that exceeds 80 % of the loan balance so I broke the payments up into XXXX XXXX. I scheduled the payments for the same day because I did not want any interest to accrue. After a few days, I noticed where the XXXX payments were withdrawn from my bank but that my Ocwen balance had not changed to reflect my XXXX payments. I called Ocwen on XXXX XXXX to ask about the payment. The Ocwen rep explained that because the XXXX payments were scheduled for the same day, that it exceeded the 80 % max limit. I asked Ocwen to return my XXXX payments. The rep said it would take XXXX business days. I called Ocwen back on XXXX XXXX when my funds had not been returned to my bank account. I gave them my account number and routing number to my bank, information that I am sure they already had since my monthly mortgage payments are debited from my account. They asked me to wait a couple more days. I called Ocwen back on XXXX XXXX. The rep asked me to be patient and assured me that the funds would be returned by XXXX XXXX. There were no funds returned to my account on XXXX XXXX. I called Ocwen again. I was asked to wait 40+ minutes on hold while the Ocwen rep put me in touch with an escalation specialist ( ES ). The ES told me that my funds would be reurned within XXXX hours and that he had put it in as a priority because I had called so often. There were no funds on XXXX XXXX. I called Ocwen again to see if there was a problem. There was. After speaking to a rep and another ES, I was told that my funds could not be returned to me! The ES said that he did not see my funds! He claimed to put me in touch with someone who XXXX be able to address my concerns. So that 's where I am now, on hold waiting to speak to yet another person! 
This is a significant amount of money and I fear that Ocwen is trying to get away with keeping my XXXX payments!\n", + "5. In XX/XX/XXXX we received our first bill from XXXX XXXX for XXXX. ( attached ) We promptly paid the bill. Again, in XX/XX/XXXX we paid our second bill for XXXX. Again, both on time. Then when XX/XX/XXXX statement came we were billed XXXX. My husband called XXXX XXXX to find out what the issue was. We were told there was a loan shortfall caused by us paying XXXX ( the amount we were billed ) and that our loan was re-amoritized. I argued I had a fixed rate loan, had never missed a payment, had never made a late payment, and paid exactly what was billed. Well, after double checking my promissory note ( attached ) and TILA ( attached ) I was to always be billed XXXX. XXXX changed the monthly payment and thus caused a shortfall. When I told them this information they refused to correct the shortfall. Not only did they not correct the mistake they refused to return my calls or emails to provide answers for these issues. Around 90 days later and this issue still exists and they still refuse to answer. Additionally I offered to make up the shortfall myself by offering a check for the missing amount and they just applied it to interest. Thus the shortfall still exists. The extra amount would have gone directly to principal. Additionally, in XX/XX/XXXX we made an extra payment amount on top of the monthly payment. This was made all at the same time. The letter we sent contained directions to only apply extra payments beyond the monthly requirement be applied to principal and not the next months payment. This was ignored. Then XXXX \" a higher up '' as she calls herself lied and told me it went to principal when clearly it did not. We requested this be fixed and it has not been done. No one has offered to fix anything either. A certified letter is attached that I mailed. I also made dozens of calls.\n", "\n" ] } @@ -1613,7 +1593,7 @@ { "data": { "text/html": [ - "Query job a7ce86a7-3a18-47b9-a46f-98dbe6a5a339 is DONE. 0 Bytes processed. Open Job" + "Query job de5da6c9-96b5-42a1-b199-42687392fe37 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1651,7 +1631,7 @@ { "data": { "text/html": [ - "Query job d568c03d-6bbd-4c3e-b087-563b7f5135ed is DONE. 0 Bytes processed. Open Job" + "Query job 1363c327-00b5-4835-a902-da84882bc996 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1663,7 +1643,7 @@ { "data": { "text/html": [ - "Query job 17eaa806-51a4-4ee9-b219-75455d0095a7 is DONE. 8 Bytes processed. Open Job" + "Query job c5996f1e-a140-4e7d-8775-091e1a73d882 is DONE. 8 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1675,7 +1655,7 @@ { "data": { "text/html": [ - "Query job e6d40ded-691d-4523-94ea-dd8202bd0220 is DONE. 2 Bytes processed. Open Job" + "Query job db1de3ab-2e6e-4b3f-8e6a-01bad33ac45f is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1687,7 +1667,7 @@ { "data": { "text/html": [ - "Query job 200f0b88-7b6d-417b-a181-a98138e3bc95 is DONE. 193 Bytes processed. Open Job" + "Query job 38d9a9d0-7f03-4091-858b-f864da30987e is DONE. 375 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1699,7 +1679,7 @@ { "data": { "text/plain": [ - "'The most obvious difference between the two lists of comments is that list 1 is related to credit reporting disputes and list 2 is a collection of general consumer banking complaints.'" + "'The most obvious difference between the two lists of comments is the subject matter. 
The first list of comments is primarily focused on issues with financial institutions, such as Navient, Nelnet, PayPal, and Mr. Cooper. The second list of comments is primarily focused on issues with government agencies, such as the National Collegiate Trust, the USDA, and Ocwen.'" ] }, "execution_count": 19, @@ -1753,7 +1733,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb index b3c965aded..e7b69f017b 100644 --- a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb +++ b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb @@ -1051,7 +1051,7 @@ "source": [ "## Estimators\n", "\n", - "Following Scikit-Learn, all learning components are \"estimators\"; objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", + "Following scikit-learn, all learning components are \"estimators\"; objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", "\n", "- a constructor that takes a list of parameters\n", "- a standard string representation that shows the class name and all non-default parameters, e.g. `LinearRegression(fit_intercept=False)`\n", diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb index ec14d15cdf..2873527449 100644 --- a/notebooks/regression/sklearn_linear_regression.ipynb +++ b/notebooks/regression/sklearn_linear_regression.ipynb @@ -7,7 +7,7 @@ "source": [ "# Using ML - SKLearn linear regression\n", "\n", - "This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with Scikit-Learn." + "This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with scikit-learn." ] }, { diff --git a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb deleted file mode 100644 index 598d958f0c..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb +++ /dev/null @@ -1,723 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a pytorch model with Vertex AI SDK 2.0 and Bigframes\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"VertexOpen in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a pytorch model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes\n", - "! pip3 install --upgrade --quiet torch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import torch\n", - "import vertexai\n", - "from vertexai.preview import VertexModel\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-pytorch\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b44cdc4e03f1" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"species\"] = df[\"species\"].map(species_categories)\n", - "\n", - "# Assign an index column name\n", - "index_col = \"index\"\n", - "df.index.name = index_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9cb8616b1997" - }, - "outputs": [], - "source": [ - "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", - "label_columns = df[[\"species\"]]\n", - "train_X, test_X, train_y, test_y = bf_train_test_split(\n", - " feature_columns, label_columns, test_size=0.2\n", - ")\n", - "\n", - "print(\"X_train size: \", train_X.size)\n", - "print(\"X_test size: \", test_X.size)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "23fe7b734b08" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5904a0f1bb03" - }, - "source": [ - "## PyTorch remote training with CPU (Custom PyTorch model)\n", - "\n", - "First, train a PyTorch model as a remote training job:\n", - "\n", - "- Reinitialize Vertex AI for remote training.\n", - "- Set TorchLogisticRegression for the remote training job.\n", - "- Invoke TorchLogisticRegression locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2a1b85195a17" - }, - "outputs": [], - "source": [ - "# define the custom model\n", - "class TorchLogisticRegression(VertexModel, torch.nn.Module):\n", - " def __init__(self, input_size: int, output_size: int):\n", - " torch.nn.Module.__init__(self)\n", - " VertexModel.__init__(self)\n", - " self.linear = torch.nn.Linear(input_size, output_size)\n", - " self.softmax = torch.nn.Softmax(dim=1)\n", - "\n", - " def forward(self, x):\n", - " return self.softmax(self.linear(x))\n", - "\n", - " @vertexai.preview.developer.mark.train()\n", - " def train(self, X, y, num_epochs, lr):\n", - " X = X.to(torch.float32)\n", - " y = torch.flatten(y) # necessary to get 1D tensor\n", - " dataloader = torch.utils.data.DataLoader(\n", - " torch.utils.data.TensorDataset(X, y),\n", - " batch_size=10,\n", - " shuffle=True,\n", - " generator=torch.Generator(device=X.device),\n", - " )\n", - "\n", - " criterion = torch.nn.CrossEntropyLoss()\n", - " optimizer = torch.optim.SGD(self.parameters(), lr=lr)\n", - "\n", - " for t in range(num_epochs):\n", - " for batch, (X, y) in enumerate(dataloader):\n", - " optimizer.zero_grad()\n", - " pred = self(X)\n", - " loss = criterion(pred, y)\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " @vertexai.preview.developer.mark.predict()\n", - " def predict(self, X):\n", - " X = torch.tensor(X).to(torch.float32)\n", - " with torch.no_grad():\n", - " pred = torch.argmax(self(X), dim=1)\n", - " return pred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4e35593f520a" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "# Instantiate model\n", - "model = TorchLogisticRegression(4, 3)\n", - "\n", - "# Set training config\n", - 
"model.train.vertex.remote_config.custom_commands = [\n", - " \"pip install torchdata\",\n", - " \"pip install torcharrow\",\n", - "]\n", - "model.train.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-model\"\n", - "model.train.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Train model on Vertex\n", - "model.train(train_X, train_y, num_epochs=200, lr=0.05)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "edf4d0708f02" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42dfbff0ca15" - }, - "outputs": [], - "source": [ - "vertexai.preview.init(remote=True)\n", - "\n", - "# Set remote config\n", - "model.predict.vertex.remote_config.custom_commands = [\n", - " \"pip install torchdata\",\n", - " \"pip install torcharrow\",\n", - "]\n", - "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-predict\"\n", - "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "predictions = model.predict(test_X)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4340ed8316cd" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Evaluate model results locally." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eb27a31cec6f" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to torch tensor for local evaluation\n", - "train_X_tensor = torch.from_numpy(\n", - " train_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "train_y_tensor = torch.from_numpy(\n", - " train_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "\n", - "test_X_tensor = torch.from_numpy(\n", - " test_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")\n", - "test_y_tensor = torch.from_numpy(\n", - " test_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7db44ad81389" - }, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "\n", - "# Switch to local mode for evaluation\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's accuracy score\n", - "print(\n", - " f\"Train accuracy: {accuracy_score(train_y_tensor, model.predict(train_X_tensor))}\"\n", - ")\n", - "\n", - "print(f\"Test accuracy: {accuracy_score(test_y_tensor, model.predict(test_X_tensor))}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! 
gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_pytorch.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb deleted file mode 100644 index 021c070753..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb +++ /dev/null @@ -1,727 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a scikit-learn model with Vertex AI SDK 2.0 and Bigframes\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"VertexOpen in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a scikit-learn model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import vertexai\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-sklearn\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b44cdc4e03f1" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"species\"] = df[\"species\"].map(species_categories)\n", - "\n", - "# Assign an index column name\n", - "index_col = \"index\"\n", - "df.index.name = index_col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9cb8616b1997" - }, - "outputs": [], - "source": [ - "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", - "label_columns = df[[\"species\"]]\n", - "train_X, test_X, train_y, test_y = bf_train_test_split(\n", - " feature_columns, label_columns, test_size=0.2\n", - ")\n", - "\n", - "print(\"X_train size: \", train_X.size)\n", - "print(\"X_test size: \", test_X.size)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8306545fcc57" - }, - "source": [ - "## Feature transformation\n", - "\n", - "Next, you do feature transformations on the data using the Vertex AI remote training service.\n", - "\n", - "First, you re-initialize Vertex AI to enable remote training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "55e701c31036" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4a0e9d59b273" - }, - "source": [ - "### Execute remote job for fit_transform() on training data\n", - "\n", - "Next, indicate that the `StandardScalar` class is to be executed remotely. Then set up the data transform and call the `fit_transform()` method is executed remotely." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "90333089d362" - }, - "outputs": [], - "source": [ - "from sklearn.preprocessing import StandardScaler\n", - "\n", - "# Wrap classes to enable Vertex remote execution\n", - "StandardScaler = vertexai.preview.remote(StandardScaler)\n", - "\n", - "# Instantiate transformer\n", - "transformer = StandardScaler()\n", - "\n", - "# Set training config\n", - "transformer.fit_transform.vertex.remote_config.display_name = (\n", - " f\"{REMOTE_JOB_NAME}-fit-transformer-bigframes\"\n", - ")\n", - "transformer.fit_transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Execute transformer on Vertex (train_X is bigframes.dataframe.DataFrame, X_train is np.array)\n", - "X_train = transformer.fit_transform(train_X)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6bf95574c907" - }, - "source": [ - "### Remote transform on test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "da6eea22a89a" - }, - "outputs": [], - "source": [ - "# Transform test dataset before calculate test score\n", - "transformer.transform.vertex.remote_config.display_name = (\n", - " REMOTE_JOB_NAME + \"-transformer\"\n", - ")\n", - "transformer.transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Execute transformer on Vertex (test_X is bigframes.dataframe.DataFrame, X_test is np.array)\n", - "X_test = transformer.transform(test_X)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ddf906c886e4" - }, - "source": [ - "## Remote training\n", - "\n", - "First, train the scikit-learn model as a remote training job:\n", - "\n", - "- Set LogisticRegression for the remote training job.\n", - "- Invoke LogisticRegression locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c7b0116fa60c" - }, - "outputs": [], - "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "# Wrap classes to enable Vertex remote execution\n", - "LogisticRegression = vertexai.preview.remote(LogisticRegression)\n", - "\n", - "# Instantiate model, warm_start=True for uptraining\n", - "model = LogisticRegression(warm_start=True)\n", - "\n", - "# Set training config\n", - "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-sklearn-model\"\n", - "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "\n", - "# Train model on Vertex\n", - "model.fit(train_X, train_y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ffe1d5903bcb" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "d00ce35920fa" - }, - "outputs": [], - "source": [ - "# Remote evaluation\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "# Evaluate model's accuracy score\n", - "predictions = model.predict(test_X)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a8cd6cbd4403" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Score model results locally." 
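The scoring cells below first flip the SDK out of remote mode. Condensed, the toggle the notebook relies on (the same `vertexai.preview.init` calls that appear in the surrounding cells, shown here only as a sketch of the pattern) is:

```
import vertexai

# Remote mode: wrapped fit()/predict() calls launch Vertex AI jobs.
vertexai.preview.init(remote=True)

# ...remote training and prediction...

# Local mode: score()/evaluate() run in the notebook kernel on pandas data.
vertexai.preview.init(remote=False)
```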
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dc105dafdfb9" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to pandas dataframe for local evaluation\n", - "train_X_pd = train_X.to_pandas().reset_index(drop=True)\n", - "train_y_pd = train_y.to_pandas().reset_index(drop=True)\n", - "\n", - "test_X_pd = test_X.to_pandas().reset_index(drop=True)\n", - "test_y_pd = test_y.to_pandas().reset_index(drop=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "25fec549de69" - }, - "outputs": [], - "source": [ - "# Switch to local mode for testing\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's accuracy score\n", - "print(f\"Train accuracy: {model.score(train_X_pd, train_y_pd)}\")\n", - "\n", - "print(f\"Test accuracy: {model.score(test_X_pd, test_y_pd)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_sklearn.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb deleted file mode 100644 index e6843b66b5..0000000000 --- a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb +++ /dev/null @@ -1,646 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Train a Tensorflow Keras model with Vertex AI SDK 2.0 and Bigframes \n", - "\n", - "\n", - " \n", - " \n", - "
\n", - "  Run in Colab | View on GitHub | Open in Vertex AI Workbench\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial demonstrates how to train a tensorflow keras model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", - "\n", - "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d975e698c9a4" - }, - "source": [ - "### Objective\n", - "\n", - "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", - "\n", - "\n", - "This tutorial uses the following Google Cloud ML services:\n", - "\n", - "- `Vertex AI Training`\n", - "- `Vertex AI Remote Training`\n", - "\n", - "\n", - "The steps performed include:\n", - "\n", - "- Initialize a dataframe from a BigQuery table and split the dataset\n", - "- Perform transformations as a Vertex AI remote training.\n", - "- Train the model remotely and evaluate the model locally\n", - "\n", - "**Local-to-remote training**\n", - "\n", - "```\n", - "import vertexai\n", - "from my_module import MyModelClass\n", - "\n", - "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", - "\n", - "# Wrap the model class with `vertex_ai.preview.remote`\n", - "MyModelClass = vertexai.preview.remote(MyModelClass)\n", - "\n", - "# Instantiate the class\n", - "model = MyModelClass(...)\n", - "\n", - "# Optional set remote config\n", - "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", - "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", - "\n", - "# This `fit` call will be executed remotely\n", - "model.fit(...)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "08d289fa873f" - }, - "source": [ - "### Dataset\n", - "\n", - "This tutorial uses the IRIS dataset, which predicts the iris species." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aed92deeb4a0" - }, - "source": [ - "### Costs\n", - "\n", - "This tutorial uses billable components of Google Cloud:\n", - "\n", - "* Vertex AI\n", - "* BigQuery\n", - "* Cloud Storage\n", - "\n", - "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", - "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", - "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", - "to generate a cost estimate based on your projected usage." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7EUnXsZhAGF" - }, - "source": [ - "## Installation\n", - "\n", - "Install the following packages required to execute this notebook. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2b4ef9b72d43" - }, - "outputs": [], - "source": [ - "# Install the packages\n", - "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", - "! pip3 install --upgrade --quiet bigframes\n", - "! pip3 install --upgrade --quiet tensorflow==2.12.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "58707a750154" - }, - "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f200f10a1da3" - }, - "outputs": [], - "source": [ - "# Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BF1j6f9HApxa" - }, - "source": [ - "## Before you begin\n", - "\n", - "### Set up your Google Cloud project\n", - "\n", - "**The following steps are required, regardless of your notebook environment.**\n", - "\n", - "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", - "\n", - "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", - "\n", - "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", - "\n", - "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WReHDGG5g0XY" - }, - "source": [ - "#### Set your project ID\n", - "\n", - "**If you don't know your project ID**, try the following:\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oM1iC_MfAts1" - }, - "outputs": [], - "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "region" - }, - "source": [ - "#### Region\n", - "\n", - "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "region" - }, - "outputs": [], - "source": [ - "REGION = \"us-central1\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sBCra4QMA2wR" - }, - "source": [ - "### Authenticate your Google Cloud account\n", - "\n", - "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" - }, - "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "254614fa0c46" - }, - "outputs": [], - "source": [ - "# ! gcloud auth login" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ef21552ccea8" - }, - "source": [ - "**3. 
Colab, uncomment and run:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "603adbbf0532" - }, - "outputs": [], - "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f6b2ccc891ed" - }, - "source": [ - "**4. Service account or other**\n", - "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zgPO1eR3CYjk" - }, - "source": [ - "### Create a Cloud Storage bucket\n", - "\n", - "Create a storage bucket to store intermediate artifacts such as datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MzGDU7TWdts_" - }, - "outputs": [], - "source": [ - "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-EcIXiGsCePi" - }, - "source": [ - "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NIq7R4HZCfIc" - }, - "outputs": [], - "source": [ - "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "960505627ddf" - }, - "source": [ - "### Import libraries and define constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PyQmSRbKA8r-" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bf\n", - "import tensorflow as tf\n", - "import vertexai\n", - "from tensorflow import keras\n", - "\n", - "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", - "bf.options.bigquery.project = PROJECT_ID\n", - "\n", - "from bigframes.ml.model_selection import \\\n", - " train_test_split as bf_train_test_split" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "source": [ - "## Initialize Vertex AI SDK for Python\n", - "\n", - "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "init_aip:mbsdk,all" - }, - "outputs": [], - "source": [ - "vertexai.init(\n", - " project=PROJECT_ID,\n", - " location=REGION,\n", - " staging_bucket=BUCKET_URI,\n", - ")\n", - "\n", - "REMOTE_JOB_NAME = \"sdk2-bigframes-tensorflow\"\n", - "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "105334524e96" - }, - "source": [ - "## Prepare the dataset\n", - "\n", - "Now load the Iris dataset and split the data into train and test sets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "94576deccd8c" - }, - "outputs": [], - "source": [ - "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", - "\n", - "species_categories = {\n", - " \"versicolor\": 0,\n", - " \"virginica\": 1,\n", - " \"setosa\": 2,\n", - "}\n", - "df[\"target\"] = df[\"species\"].map(species_categories)\n", - "df = df.drop(columns=[\"species\"])\n", - "\n", - "train, test = bf_train_test_split(df, test_size=0.2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cfcbce726efa" - }, - "source": [ - "## Remote training with GPU\n", - "\n", - "First, train a TensorFlow model as a remote training job:\n", - "\n", - "- Reinitialize Vertex AI for remote training.\n", - "- Instantiate the tensorflow keras model for the remote training job.\n", - "- Invoke the tensorflow keras model.fit() locally which will launch the remote training job." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fd865b0c4e8b" - }, - "outputs": [], - "source": [ - "# Switch to remote mode for training\n", - "vertexai.preview.init(remote=True)\n", - "\n", - "keras.Sequential = vertexai.preview.remote(keras.Sequential)\n", - "\n", - "# Instantiate model\n", - "model = keras.Sequential(\n", - " [keras.layers.Dense(5, input_shape=(4,)), keras.layers.Softmax()]\n", - ")\n", - "\n", - "# Specify optimizer and loss function\n", - "model.compile(optimizer=\"adam\", loss=\"mean_squared_error\")\n", - "\n", - "# Set training config\n", - "model.fit.vertex.remote_config.enable_cuda = True\n", - "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-model-gpu\"\n", - "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "model.fit.vertex.remote_config.custom_commands = [\"pip install tensorflow-io==0.32.0\"]\n", - "\n", - "# Manually set compute resources this time\n", - "model.fit.vertex.remote_config.machine_type = \"n1-highmem-4\"\n", - "model.fit.vertex.remote_config.accelerator_type = \"NVIDIA_TESLA_K80\"\n", - "model.fit.vertex.remote_config.accelerator_count = 4\n", - "\n", - "# Train model on Vertex\n", - "model.fit(train, epochs=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f1af94ac1477" - }, - "source": [ - "## Remote prediction\n", - "\n", - "Obtain predictions from the trained model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1d75879948b5" - }, - "outputs": [], - "source": [ - "vertexai.preview.init(remote=True)\n", - "\n", - "# Set remote config\n", - "model.predict.vertex.remote_config.enable_cuda = False\n", - "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-predict-cpu\"\n", - "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", - "model.predict.vertex.remote_config.custom_commands = [\n", - " \"pip install tensorflow-io==0.32.0\"\n", - "]\n", - "\n", - "predictions = model.predict(train)\n", - "\n", - "print(f\"Remote predictions: {predictions}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "798b77c95067" - }, - "source": [ - "## Local evaluation\n", - "\n", - "Evaluate model results locally." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "88e734e30791" - }, - "outputs": [], - "source": [ - "# User must convert bigframes to pandas dataframe for local evaluation\n", - "feature_columns = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", - "label_columns = [\"target\"]\n", - "\n", - "train_X_np = train[feature_columns].to_pandas().values.astype(float)\n", - "train_y_np = train[label_columns].to_pandas().values.astype(float)\n", - "train_ds = tf.data.Dataset.from_tensor_slices((train_X_np, train_y_np))\n", - "\n", - "test_X_np = test[feature_columns].to_pandas().values.astype(float)\n", - "test_y_np = test[label_columns].to_pandas().values.astype(float)\n", - "test_ds = tf.data.Dataset.from_tensor_slices((test_X_np, test_y_np))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cb8637f783ad" - }, - "outputs": [], - "source": [ - "# Switch to local mode for evaluation\n", - "vertexai.preview.init(remote=False)\n", - "\n", - "# Evaluate model's mean square errors\n", - "print(f\"Train loss: {model.evaluate(train_ds.batch(32))}\")\n", - "\n", - "print(f\"Test loss: {model.evaluate(test_ds.batch(32))}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sx_vKniMq9ZX" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Delete Cloud Storage objects that were created\n", - "delete_bucket = False\n", - "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", - " ! 
gsutil -m rm -r $BUCKET_URI" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "sdk2_bigframes_tensorflow.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/noxfile.py b/noxfile.py index 6b36995480..4ac3a81723 100644 --- a/noxfile.py +++ b/noxfile.py @@ -341,8 +341,8 @@ def run_system( pytest_cmd.extend(extra_pytest_options) session.run( *pytest_cmd, - test_folder, *session.posargs, + test_folder, ) @@ -399,7 +399,7 @@ def load(session: nox.sessions.Session): prefix_name="load", test_folder=os.path.join("tests", "system", "load"), print_duration=True, - timeout_seconds=60 * 60, + timeout_seconds=60 * 60 * 12, ) @@ -467,6 +467,12 @@ def docs(session): ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + + session.run( + "python", + "scripts/publish_api_coverage.py", + "docs", + ) session.run( "sphinx-build", "-W", # warnings as errors @@ -503,6 +509,12 @@ def docfx(session): ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + + session.run( + "python", + "scripts/publish_api_coverage.py", + "docs", + ) session.run( "sphinx-build", "-T", # show full traceback on exception diff --git a/samples/snippets/gemini_model_test.py b/samples/snippets/gemini_model_test.py new file mode 100644 index 0000000000..89212875ae --- /dev/null +++ b/samples/snippets/gemini_model_test.py @@ -0,0 +1,44 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_gemini_text_generator_model(): + # Determine project id, in this case prefer the one set in the environment + # variable GOOGLE_CLOUD_PROJECT (if any) + import os + + PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev") + REGION = "us" + CONN_NAME = "bigframes-default-connection" + + # [START bigquery_dataframes_gemini_model] + from bigframes.ml.llm import GeminiTextGenerator + import bigframes.pandas as bpd + + # Create the Gemini LLM model + session = bpd.get_global_session() + connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}" + model = GeminiTextGenerator(session=session, connection_name=connection) + + df_api = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv") + + # Prepare the prompts and send them to the LLM model for prediction + df_prompt_prefix = "Generate Pandas sample code for DataFrame." 
+ df_prompt = df_prompt_prefix + df_api["API"] + + # Predict using the model + df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024) + # [END bigquery_dataframes_gemini_model] + assert df_pred["ml_generate_text_llm_result"] is not None + assert df_pred["ml_generate_text_llm_result"].iloc[0] is not None diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index a15ea16853..ae3a934004 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -29,8 +29,9 @@ def run_quickstart(project_id: str): import bigframes.pandas as bpd # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. bpd.options.bigquery.project = your_gcp_project_id - bpd.options.bigquery.location = "us" # Create a DataFrame from a BigQuery table query_or_table = "bigquery-public-data.ml_datasets.penguins" diff --git a/samples/snippets/set_options_test.py b/samples/snippets/set_options_test.py index ef6f41ce54..f981009e9a 100644 --- a/samples/snippets/set_options_test.py +++ b/samples/snippets/set_options_test.py @@ -26,7 +26,14 @@ def test_bigquery_dataframes_set_options(): REGION = "US" # @param {type:"string"} # Set BigQuery DataFrames options + # Note: The project option is not required in all environments. + # On BigQuery Studio, the project ID is automatically detected. bpd.options.bigquery.project = PROJECT_ID + + # Note: The location option is not required. + # It defaults to the location of the first table or query + # passed to read_gbq(). For APIs where a location can't be + # auto-detected, the location defaults to the "US" location. bpd.options.bigquery.location = REGION # [END bigquery_dataframes_set_options] diff --git a/scripts/get_code_sample_coverage.py b/scripts/get_documentation_coverage.py similarity index 74% rename from scripts/get_code_sample_coverage.py rename to scripts/get_documentation_coverage.py index d81023394f..0b9417b2d3 100755 --- a/scripts/get_code_sample_coverage.py +++ b/scripts/get_documentation_coverage.py @@ -16,7 +16,7 @@ import importlib import inspect import sys -from typing import Dict, List +import typing import bigframes import bigframes.pandas as bpd @@ -50,6 +50,11 @@ "remote", ] +COVERAGE_GENERATORS = { + "documentation": lambda docstr: docstr, + "code samples": lambda docstr: docstr and "**Examples:**" in docstr, +} + for module_name in ML_MODULE_NAMES: module = importlib.import_module(f"bigframes.ml.{module_name}") classes_ = [ @@ -58,9 +63,15 @@ CLASSES.extend(classes_) -def get_code_samples_summary() -> Dict[str, Dict[str, List[str]]]: +def get_coverage_summary( + func: typing.Callable, +) -> typing.Dict[str, typing.Dict[str, typing.List[str]]]: """Get Summary of the code samples coverage in BigFrames APIs. + Args: + func (callable): + Function to accept documentation and return whether it satisfies + coverage. 
Returns: Summary: A dictionary of the format { @@ -73,7 +84,7 @@ def get_code_samples_summary() -> Dict[str, Dict[str, List[str]]]: } } """ - summary: Dict[str, Dict[str, List[str]]] = dict() + summary: typing.Dict[str, typing.Dict[str, typing.List[str]]] = dict() for class_ in CLASSES: class_key = f"{class_.__module__}.{class_.__name__}" @@ -104,8 +115,8 @@ def predicate(impl): impl = getattr(class_, name) docstr = inspect.getdoc(impl) - code_samples_present = docstr and "**Examples:**" in docstr - key = PRESENT if code_samples_present else NOT_PRESENT + coverage_present = func(docstr) + key = PRESENT if coverage_present else NOT_PRESENT summary[class_key][key].append(name) return summary @@ -113,7 +124,16 @@ def predicate(impl): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Get a summary of code samples coverage in BigFrames APIs." + description="Get a summary of documentation coverage in BigFrames APIs." + ) + parser.add_argument( + "-c", + "--code-samples", + type=bool, + action=argparse.BooleanOptionalAction, + default=False, + help="Whether to calculate code samples coverage. By default the tool" + " calculates the documentation (docstring) coverage.", ) parser.add_argument( "-d", @@ -121,12 +141,13 @@ def predicate(impl): type=bool, action=argparse.BooleanOptionalAction, default=False, - help="Whether to print APIs with and without code samples.", + help="Whether to print APIs with and without the coverage.", ) args = parser.parse_args(sys.argv[1:]) - summary = get_code_samples_summary() + scenario = "code samples" if args.code_samples else "documentation" + summary = get_coverage_summary(COVERAGE_GENERATORS[scenario]) total_with_code_samples = 0 total = 0 @@ -140,8 +161,8 @@ def predicate(impl): coverage = 100 * apis_with_code_samples / apis_total print(f"{class_}: {coverage:.1f}% ({apis_with_code_samples}/{apis_total})") if args.details: - print(f"===> APIs WITH code samples: {class_summary[PRESENT]}") - print(f"===> APIs WITHOUT code samples: {class_summary[NOT_PRESENT]}") + print(f"===> APIs WITH {scenario}: {class_summary[PRESENT]}") + print(f"===> APIs WITHOUT {scenario}: {class_summary[NOT_PRESENT]}") coverage = 100 * total_with_code_samples / total print(f"Total: {coverage:.1f}% ({total_with_code_samples}/{total})") diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 856307e440..25fbfbf988 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -17,27 +17,112 @@ import argparse import inspect +import pathlib +import sys import pandas as pd +import pandas.core.groupby +import pandas.core.indexes.accessors +import pandas.core.strings.accessor +import pandas.core.window.rolling +import bigframes +import bigframes.core.groupby +import bigframes.core.window +import bigframes.operations.datetimes import bigframes.pandas as bpd +REPO_ROOT = pathlib.Path(__file__).parent.parent + +URL_PREFIX = { + "pandas": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_" + ), + "dataframe": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_" + ), + "dataframegroupby": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.DataFrameGroupBy#bigframes_core_groupby_DataFrameGroupBy_" + ), + "index": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.indexes.base.Index#bigframes_core_indexes_base_Index_" + ), + 
"series": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_" + ), + "seriesgroupby": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.SeriesGroupBy#bigframes_core_groupby_SeriesGroupBy_" + ), + "datetimemethods": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.datetimes.DatetimeMethods#bigframes_operations_datetimes_DatetimeMethods_" + ), + "stringmethods": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.strings.StringMethods#bigframes_operations_strings_StringMethods_" + ), + "window": ( + "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.window.Window#bigframes_core_window_Window_" + ), +} + + +PANDAS_TARGETS = [ + ("pandas", pd, bpd), + ("dataframe", pd.DataFrame, bpd.DataFrame), + ( + "dataframegroupby", + pandas.core.groupby.DataFrameGroupBy, + bigframes.core.groupby.DataFrameGroupBy, + ), + ("series", pd.Series, bpd.Series), + ( + "seriesgroupby", + pandas.core.groupby.DataFrameGroupBy, + bigframes.core.groupby.DataFrameGroupBy, + ), + ( + "datetimemethods", + pandas.core.indexes.accessors.CombinedDatetimelikeProperties, + bigframes.operations.datetimes.DatetimeMethods, + ), + ( + "stringmethods", + pandas.core.strings.accessor.StringMethods, + bigframes.operations.strings.StringMethods, + ), + ( + "window", + pandas.core.window.rolling.Rolling, + bigframes.core.window.Window, + ), + ("index", pd.Index, bpd.Index), +] + + +def names_from_signature(signature): + """Extract the names of parameters from signature + + See: https://docs.python.org/3/library/inspect.html#inspect.signature + """ + return frozenset({parameter for parameter in signature.parameters}) + + +def calculate_missing_parameters(bigframes_function, target_function): + bigframes_params = names_from_signature(inspect.signature(bigframes_function)) + target_params = names_from_signature(inspect.signature(target_function)) + return target_params - bigframes_params + def generate_pandas_api_coverage(): """Inspect all our pandas objects, and compare with the real pandas objects, to see which methods we implement. 
For each, generate a regex that can be used to check if its present in a notebook""" - header = ["api", "pattern", "kind", "is_in_bigframes"] + header = ["api", "pattern", "kind", "is_in_bigframes", "missing_parameters"] api_patterns = [] - targets = [ - ("pandas", pd, bpd), - ("dataframe", pd.DataFrame, bpd.DataFrame), - ("series", pd.Series, bpd.Series), - ("index", pd.Index, bpd.Index), - ] indexers = ["loc", "iloc", "iat", "ix", "at"] - for name, pandas_obj, bigframes_obj in targets: + for name, pandas_obj, bigframes_obj in PANDAS_TARGETS: for member in dir(pandas_obj): + missing_parameters = "" + # skip private functions and properties if member[0] == "_" and member[1] != "_": continue @@ -50,6 +135,17 @@ def generate_pandas_api_coverage(): # Function, match .member( token = f"\\.{member}\\(" token_type = "function" + + if hasattr(bigframes_obj, member): + bigframes_function = getattr(bigframes_obj, member) + pandas_function = getattr(pandas_obj, member) + missing_parameters = ", ".join( + sorted( + calculate_missing_parameters( + bigframes_function, pandas_function + ) + ) + ) elif member in indexers: # Indexer, match .indexer[ token = f"\\.{member}\\[" @@ -62,7 +158,13 @@ def generate_pandas_api_coverage(): is_in_bigframes = hasattr(bigframes_obj, member) api_patterns.append( - [f"{name}.{member}", token, token_type, is_in_bigframes] + [ + f"{name}.{member}", + token, + token_type, + is_in_bigframes, + missing_parameters, + ] ) return pd.DataFrame(api_patterns, columns=header) @@ -165,14 +267,112 @@ def build_api_coverage_table(bigframes_version: str, release_version: str): return combined_df.infer_objects().convert_dtypes() +def format_api(api_names, is_in_bigframes, api_prefix): + api_names = api_names.str.slice(start=len(f"{api_prefix}.")) + formatted = "" + api_names + "" + url_prefix = URL_PREFIX.get(api_prefix) + if url_prefix is None: + return formatted + + linked = '' + formatted + "" + return formatted.mask(is_in_bigframes, linked) + + +def generate_api_coverage(df, api_prefix): + dataframe_apis = df.loc[df["api"].str.startswith(f"{api_prefix}.")] + fully_implemented = ( + dataframe_apis["missing_parameters"].str.len() == 0 + ) & dataframe_apis["is_in_bigframes"] + partial_implemented = ( + dataframe_apis["missing_parameters"].str.len() != 0 + ) & dataframe_apis["is_in_bigframes"] + not_implemented = ~dataframe_apis["is_in_bigframes"] + dataframe_table = pd.DataFrame( + { + "API": format_api( + dataframe_apis["api"], + dataframe_apis["is_in_bigframes"], + api_prefix, + ), + "Implemented": "", + "Missing parameters": dataframe_apis["missing_parameters"], + } + ) + dataframe_table.loc[fully_implemented, "Implemented"] = "Y" + dataframe_table.loc[partial_implemented, "Implemented"] = "P" + dataframe_table.loc[not_implemented, "Implemented"] = "N" + return dataframe_table + + +def generate_api_coverage_doc(df, api_prefix): + dataframe_table = generate_api_coverage(df, api_prefix) + dataframe_table = dataframe_table.loc[~(dataframe_table["Implemented"] == "N")] + dataframe_table["Implemented"] = dataframe_table["Implemented"].map( + { + "Y": "Y", + "P": "P", + } + ) + + with open( + REPO_ROOT / "docs" / "supported_pandas_apis" / f"bf_{api_prefix}.html", + "w", + ) as html_file: + dataframe_table.to_html( + html_file, index=False, header=True, escape=False, border=0, col_space="8em" + ) + + +def generate_api_coverage_docs(df): + for target in PANDAS_TARGETS: + api_prefix = target[0] + generate_api_coverage_doc(df, api_prefix) + + +def print_api_coverage_summary(df, api_prefix): + 
dataframe_table = generate_api_coverage(df, api_prefix) + + print(api_prefix) + print(dataframe_table[["Implemented", "API"]].groupby(["Implemented"]).count()) + print(f"{api_prefix} APIs: {dataframe_table.shape[0]}\n") + + +def print_api_coverage_summaries(df): + for target in PANDAS_TARGETS: + api_prefix = target[0] + print_api_coverage_summary(df, api_prefix) + + print(f"\nAll APIs: {len(df.index)}") + fully_implemented = (df["missing_parameters"].str.len() == 0) & df[ + "is_in_bigframes" + ] + print(f"Y: {fully_implemented.sum()}") + partial_implemented = (df["missing_parameters"].str.len() != 0) & df[ + "is_in_bigframes" + ] + print(f"P: {partial_implemented.sum()}") + not_implemented = ~df["is_in_bigframes"] + print(f"N: {not_implemented.sum()}") + + def main(): parser = argparse.ArgumentParser() - parser.add_argument("--bigframes_version") - parser.add_argument("--release_version") + parser.add_argument("output_type") + parser.add_argument("--bigframes_version", default=bigframes.__version__) + parser.add_argument("--release_version", default="") parser.add_argument("--bigquery_table_name") args = parser.parse_args() df = build_api_coverage_table(args.bigframes_version, args.release_version) - df.to_gbq(args.bigquery_table_name, if_exists="append") + + if args.output_type == "bigquery": + df.to_gbq(args.bigquery_table_name, if_exists="append") + elif args.output_type == "docs": + generate_api_coverage_docs(df) + elif args.output_type == "summary": + print_api_coverage_summaries(df) + else: + print(f"Unexpected output_type {repr(args.output_type)}") + sys.exit(1) if __name__ == "__main__": diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 96b2d1bb48..061cc1c25c 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -27,6 +27,7 @@ def test_api_coverage_produces_expected_schema(): "string", "boolean", "string", + "string", "datetime64[ns]", "string", "string", @@ -36,6 +37,7 @@ def test_api_coverage_produces_expected_schema(): "pattern", "kind", "is_in_bigframes", + "missing_parameters", "module", "timestamp", "bigframes_version", diff --git a/setup.py b/setup.py index 768fac530c..83049f9715 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ # 'Development Status :: 3 - Alpha' # 'Development Status :: 4 - Beta' # 'Development Status :: 5 - Production/Stable' -release_status = "Development Status :: 3 - Alpha" +release_status = "Development Status :: 5 - Production/Stable" dependencies = [ # please keep these in sync with the minimum versions in testing/constraints-3.9.txt "cloudpickle >= 2.0.0", @@ -47,6 +47,7 @@ "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
"pandas >=1.5.0", + "pyarrow >=8.0.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "scikit-learn >=1.2.2", diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index 9f0786f47e..5782b03a2f 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -1,15 +1,17 @@ # Keep in sync with colab/containers/requirements.core.in image -google-auth==2.17.3 +google-auth==2.27.0 ipykernel==5.5.6 ipython==7.34.0 -notebook==6.4.8 -pandas==1.5.3 -portpicker==1.3.9 -requests==2.27.1 -tornado==6.3.1 +notebook==6.5.5 +pandas==2.0.3 +pandas-stubs==2.0.3.230814 +portpicker==1.5.2 +requests==2.31.0 +tornado==6.3.3 absl-py==1.4.0 debugpy==1.6.6 ipywidgets==7.7.1 matplotlib==3.7.1 psutil==5.9.5 +seaborn==0.13.1 traitlets==5.7.1 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 0aeb15eab8..1e1f3a3e66 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -12,6 +12,7 @@ google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 ibis-framework==8.0.0 pandas==1.5.0 +pyarrow==8.0.0 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 diff --git a/tests/config.py b/tests/config.py new file mode 100644 index 0000000000..a885d7e71d --- /dev/null +++ b/tests/config.py @@ -0,0 +1,72 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# https://cloud.google.com/bigquery/docs/locations +ALL_BIGQUERY_LOCATIONS = [ + "us-east5", + "us-south1", + "us-central1", + "us-west4", + "us-west2", + "northamerica-northeast1", + "us-east4", + "us-west1", + "us-west3", + "southamerica-east1", + "southamerica-west1", + "us-east1", + "northamerica-northeast2", + "asia-south2", + "asia-east2", + "asia-southeast2", + "australia-southeast2", + "asia-south1", + "asia-northeast2", + "asia-northeast3", + "asia-southeast1", + "australia-southeast1", + "asia-east1", + "asia-northeast1", + "europe-west1", + "europe-west10", + "europe-north1", + "europe-west3", + "europe-west2", + "europe-southwest1", + "europe-west8", + "europe-west4", + "europe-west9", + "europe-west12", + "europe-central2", + "europe-west6", + "me-central2", + "me-central1", + "me-west1", + "me-central2", + "me-central1", + "me-west1", + "africa-south1", +] + +REP_ENABLED_BIGQUERY_LOCATIONS = [ + "me-central2", + "europe-west9", + "europe-west3", + "us-east4", + "us-west1", +] + +LEP_ENABLED_BIGQUERY_LOCATIONS = sorted( + set(ALL_BIGQUERY_LOCATIONS) - set(REP_ENABLED_BIGQUERY_LOCATIONS) +) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index a108ff4a8e..70ff6eee39 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -953,6 +953,14 @@ def restore_sampling_settings(): bigframes.options.sampling.max_download_size = max_download_size +@pytest.fixture() +def with_multiquery_execution(): + original_setting = bigframes.options.compute.enable_multi_query_execution + bigframes.options.compute.enable_multi_query_execution = True + yield + bigframes.options.compute.enable_multi_query_execution = original_setting + + @pytest.fixture() def weird_strings_pd(): df = pd.DataFrame( diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 6ea4f72489..7513b78b29 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -14,28 +14,29 @@ import pandas -import bigframes.ml.cluster -import bigframes.ml.compose -import bigframes.ml.linear_model -import bigframes.ml.pipeline -import bigframes.ml.preprocessing +from bigframes.ml import compose, preprocessing def test_columntransformer_standalone_fit_and_transform( penguins_df_default_index, new_penguins_df ): - transformer = bigframes.ml.compose.ColumnTransformer( + transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( - "scale", - bigframes.ml.preprocessing.StandardScaler(), + "starndard_scale", + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + preprocessing.MinMaxScaler(), + ["culmen_length_mm"], + ), ] ) @@ -44,11 +45,6 @@ def test_columntransformer_standalone_fit_and_transform( ) result = transformer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pandas.DataFrame( { "onehotencoded_species": [ @@ -61,31 +57,26 @@ def test_columntransformer_standalone_fit_and_transform( -0.9945520581113803, -1.104611490204711, ], + "min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210], "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - expected.standard_scaled_culmen_length_mm = ( - expected.standard_scaled_culmen_length_mm.astype("Float64") - ) - expected.standard_scaled_flipper_length_mm = ( - expected.standard_scaled_flipper_length_mm.astype("Float64") - ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): - transformer = bigframes.ml.compose.ColumnTransformer( + transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( - "scale", - bigframes.ml.preprocessing.StandardScaler(), + "standard_scale", + preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -95,11 +86,6 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pandas.DataFrame( { "onehotencoded_species": [ @@ -116,11 +102,66 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - expected.standard_scaled_culmen_length_mm = ( - expected.standard_scaled_culmen_length_mm.astype("Float64") + + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) + + +def test_columntransformer_save_load(new_penguins_df, dataset_id): + transformer = compose.ColumnTransformer( + [ + ( + "onehot", + preprocessing.OneHotEncoder(), + "species", + ), + ( + "standard_scale", + preprocessing.StandardScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), + ] + ) + transformer.fit( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True ) - expected.standard_scaled_flipper_length_mm = ( - expected.standard_scaled_flipper_length_mm.astype("Float64") + + assert isinstance(reloaded_transformer, compose.ColumnTransformer) + + expected = [ + ( + "one_hot_encoder", + preprocessing.OneHotEncoder(max_categories=1000001, min_frequency=0), + "species", + ), + ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), + ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), + ] + assert reloaded_transformer.transformers_ == expected + assert reloaded_transformer._bqml_model is not None + + result = transformer.fit_transform( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pandas.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "standard_scaled_culmen_length_mm": [ + 1.313249, + 
-0.20198, + -1.111118, + ], + "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], + }, + index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) + pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c128469bd2..c165b1e030 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -222,7 +222,7 @@ def test_pipeline_logistic_regression_fit_score_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_index): """Test a supervised model with a minimal preprocessing step""" pl = pipeline.Pipeline( @@ -297,7 +297,7 @@ def test_pipeline_xgbregressor_fit_score_predict(session, penguins_df_default_in ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_random_forest_classifier_fit_score_predict( session, penguins_df_default_index ): @@ -445,7 +445,7 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_pipeline_standard_scaler_kmeans_fit_score_predict( session, penguins_pandas_df_default_index ): @@ -646,7 +646,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id compose.ColumnTransformer( [ ( - "ont_hot_encoder", + "one_hot_encoder", preprocessing.OneHotEncoder( drop="most_frequent", min_frequency=5, @@ -699,7 +699,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id transformers = pl_loaded._transform.transformers_ expected = [ ( - "ont_hot_encoder", + "one_hot_encoder", preprocessing.OneHotEncoder( drop="most_frequent", max_categories=100, min_frequency=5 ), diff --git a/tests/system/large/test_location.py b/tests/system/large/test_location.py new file mode 100644 index 0000000000..a4cf8919a0 --- /dev/null +++ b/tests/system/large/test_location.py @@ -0,0 +1,129 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import typing + +from google.cloud import bigquery +import pytest + +import bigframes +import bigframes.session.clients +from tests import config + + +def _assert_bq_execution_location(session: bigframes.Session): + df = session.read_gbq( + """ + SELECT "aaa" as name, 111 as number + UNION ALL + SELECT "bbb" as name, 222 as number + UNION ALL + SELECT "aaa" as name, 333 as number + """ + ) + + assert ( + typing.cast(bigquery.QueryJob, df.query_job).location + == session.bqclient.location + ) + + result = ( + df[["name", "number"]] + .groupby("name") + .sum(numeric_only=True) + .sort_values("number", ascending=False) + .head() + ) + + assert ( + typing.cast(bigquery.QueryJob, result.query_job).location + == session.bqclient.location + ) + + +def test_bq_location_default(): + session = bigframes.Session() + + assert session.bqclient.location == "US" + + # by default global endpoint is used + assert ( + session.bqclient._connection.API_BASE_URL == "https://bigquery.googleapis.com" + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize("bigquery_location", config.ALL_BIGQUERY_LOCATIONS) +def test_bq_location(bigquery_location): + session = bigframes.Session( + context=bigframes.BigQueryOptions(location=bigquery_location) + ) + + assert session.bqclient.location == bigquery_location + + # by default global endpoint is used + assert ( + session.bqclient._connection.API_BASE_URL == "https://bigquery.googleapis.com" + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize( + "bigquery_location", + config.REP_ENABLED_BIGQUERY_LOCATIONS, +) +def test_bq_rep_endpoints(bigquery_location): + session = bigframes.Session( + context=bigframes.BigQueryOptions( + location=bigquery_location, use_regional_endpoints=True + ) + ) + + assert session.bqclient.location == bigquery_location + assert ( + session.bqclient._connection.API_BASE_URL + == "https://bigquery.{location}.rep.googleapis.com".format( + location=bigquery_location + ) + ) + + # assert that bigframes session honors the location + _assert_bq_execution_location(session) + + +@pytest.mark.parametrize( + "bigquery_location", + config.LEP_ENABLED_BIGQUERY_LOCATIONS, +) +def test_bq_lep_endpoints(bigquery_location): + # We are not testing BigFrames Session for LEP endpoints because it involves + # query execution using the endpoint, which requires the project to be + # allowlisted for LEP access. We could hardcode one project which is + # allowlisted but then not every open source developer will have access to + # that. Let's rely on just creating the clients for LEP. + clients_provider = bigframes.session.clients.ClientsProvider( + location=bigquery_location, use_regional_endpoints=True + ) + + assert clients_provider.bqclient.location == bigquery_location + assert ( + clients_provider.bqclient._connection.API_BASE_URL + == "https://{location}-bigquery.googleapis.com".format( + location=bigquery_location + ) + ) diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py index 1d4a6b0a5b..22baa2268f 100644 --- a/tests/system/load/test_large_tables.py +++ b/tests/system/load/test_large_tables.py @@ -74,13 +74,9 @@ def test_index_repr_large_table(): assert actual is not None -# FAILED -# tests/system/load/test_large_tables.py::test_to_pandas_batches_large_table -# google.api_core.exceptions.Forbidden: 403 Response too large to return. 
-# Consider specifying a destination table in your job... -@pytest.mark.xfail def test_to_pandas_batches_large_table(): - df = bpd.read_gbq("load_testing.scalars_100gb") + df = bpd.read_gbq("load_testing.scalars_10gb") + # df will be downloaded locally expected_row_count, expected_column_count = df.shape row_count = 0 diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index c9100f36f3..33351afe45 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -47,12 +47,11 @@ def penguins_bqml_linear_model(session, penguins_linear_model_name) -> core.Bqml @pytest.fixture(scope="function") def ephemera_penguins_bqml_linear_model( - penguins_bqml_linear_model, + session: bigframes.Session, + penguins_bqml_linear_model: core.BqmlModel, ) -> core.BqmlModel: model = penguins_bqml_linear_model - return model.copy( - f"{model._model.project}.{model._model.dataset_id}.{uuid.uuid4().hex}" - ) + return model.copy(f"{session._anonymous_dataset}.{uuid.uuid4().hex}") @pytest.fixture(scope="session") diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 02030cd31e..c505057d7b 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -333,7 +333,7 @@ def test_remote_model_predict( ) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_model_generate_text( bqml_palm2_text_generator_model: core.BqmlModel, llm_text_df ): diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 4d2ddfe513..b9e4889801 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -49,7 +49,7 @@ def test_create_text_generator_32k_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -76,7 +76,7 @@ def test_create_text_generator_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_32k_model_default_session( bq_connection, llm_text_pandas_df, bigquery_client ): @@ -103,7 +103,7 @@ def test_create_text_generator_32k_model_default_session( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_create_text_generator_model_default_connection( llm_text_pandas_df, bigquery_client ): @@ -131,7 +131,7 @@ def test_create_text_generator_model_default_connection( # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
-@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -142,7 +142,7 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -153,7 +153,7 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df ): @@ -165,7 +165,7 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df ): @@ -255,7 +255,7 @@ def test_create_text_embedding_generator_multilingual_model_defaults(bq_connecti assert model._bqml_model is not None -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): @@ -267,7 +267,7 @@ def test_embedding_generator_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_multilingual_predict_success( palm2_embedding_generator_multilingual_model, llm_text_df ): @@ -279,7 +279,7 @@ def test_embedding_generator_multilingual_predict_success( assert len(value) == 768 -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): @@ -306,7 +306,7 @@ def test_create_gemini_text_generator_model( assert reloaded_model.connection_name == bq_connection -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( gemini_text_generator_model, llm_text_df ): @@ -317,7 +317,7 @@ def test_gemini_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_with_params_success( gemini_text_generator_model, llm_text_df ): diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index b40982e282..c4c7eb4b88 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -19,7 +19,8 @@ import pytest import sklearn.metrics as sklearn_metrics # type: ignore -import bigframes.ml.metrics +import bigframes +from bigframes.ml import metrics def test_r2_score_perfect_fit(session): @@ -32,9 +33,7 @@ def test_r2_score_perfect_fit(session): df = session.read_pandas(pd_df) assert ( - bigframes.ml.metrics.r2_score( - df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] - ) + metrics.r2_score(df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]]) == 1.0 ) @@ -43,7 +42,7 @@ def test_r2_score_bad_fit(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [5, 4, 3, 2, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred"]]) == -3.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred"]]) == -3.0 def 
test_r2_score_force_finite(session): @@ -56,23 +55,21 @@ def test_r2_score_force_finite(session): ) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score( + assert metrics.r2_score( df[["y_true"]], df[["y_pred_1"]], force_finite=False ) == float("-inf") - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred_1"]]) == 0.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred_1"]]) == 0.0 assert math.isnan( - bigframes.ml.metrics.r2_score( - df[["y_true"]], df[["y_pred_2"]], force_finite=False - ) + metrics.r2_score(df[["y_true"]], df[["y_pred_2"]], force_finite=False) ) - assert bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred_2"]]) == 1.0 + assert metrics.r2_score(df[["y_true"]], df[["y_pred_2"]]) == 1.0 def test_r2_score_ok_fit_matches_sklearn(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) - bf_result = bigframes.ml.metrics.r2_score(df[["y_true"]], df[["y_pred"]]) + bf_result = metrics.r2_score(df[["y_true"]], df[["y_pred"]]) sklearn_result = sklearn_metrics.r2_score(pd_df[["y_true"]], pd_df[["y_pred"]]) assert math.isclose(bf_result, sklearn_result) @@ -81,7 +78,7 @@ def test_r2_score_series(session): pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.r2_score(df["y_true"], df["y_pred"]) == 1.0 + assert metrics.r2_score(df["y_true"], df["y_pred"]) == 1.0 def test_accuracy_score_perfect_fit(session): @@ -94,7 +91,7 @@ def test_accuracy_score_perfect_fit(session): df = session.read_pandas(pd_df) assert ( - bigframes.ml.metrics.accuracy_score( + metrics.accuracy_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] ) == 1.0 @@ -105,26 +102,21 @@ def test_accuracy_score_bad_fit(session): pd_df = pd.DataFrame({"y_true": [0, 2, 1, 3, 4], "y_pred": [0, 1, 2, 3, 4]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) == 0.6 + assert metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) == 0.6 def test_accuracy_score_not_normailze(session): pd_df = pd.DataFrame({"y_true": [0, 2, 1, 3, 4], "y_pred": [0, 1, 2, 3, 4]}) df = session.read_pandas(pd_df) - assert ( - bigframes.ml.metrics.accuracy_score( - df[["y_true"]], df[["y_pred"]], normalize=False - ) - == 3 - ) + assert metrics.accuracy_score(df[["y_true"]], df[["y_pred"]], normalize=False) == 3 def test_accuracy_score_fit_matches_sklearn(session): pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) - bf_result = bigframes.ml.metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) + bf_result = metrics.accuracy_score(df[["y_true"]], df[["y_pred"]]) sklearn_result = sklearn_metrics.accuracy_score( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -135,7 +127,7 @@ def test_accuracy_score_series(session): pd_df = pd.DataFrame({"y_true": [1, 7, 3, 2, 5], "y_pred": [1, 7, 3, 2, 5]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.accuracy_score(df["y_true"], df["y_pred"]) == 1.0 + assert metrics.accuracy_score(df["y_true"], df["y_pred"]) == 1.0 def test_roc_curve_binary_classification_prediction_returns_expected(session): @@ -158,7 +150,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true_arbitrary_name"]], df[["y_score_arbitrary_name"]], 
drop_intermediate=False, @@ -219,7 +211,7 @@ def test_roc_curve_binary_classification_prediction_matches_sklearn(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) expected_fpr, expected_tpr, expected_thresholds = sklearn_metrics.roc_curve( @@ -259,7 +251,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) @@ -314,7 +306,7 @@ def test_roc_curve_binary_classification_decision_matches_sklearn(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df[["y_true"]], df[["y_score"]], drop_intermediate=False ) expected_fpr, expected_tpr, expected_thresholds = sklearn_metrics.roc_curve( @@ -350,7 +342,7 @@ def test_roc_curve_binary_classification_prediction_series(session): ) df = session.read_pandas(pd_df) - fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve( + fpr, tpr, thresholds = metrics.roc_curve( df["y_true"], df["y_score"], drop_intermediate=False ) @@ -420,7 +412,7 @@ def test_roc_auc_score_returns_expected(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score( + score = metrics.roc_auc_score( df[["y_true_arbitrary_name"]], df[["y_score_arbitrary_name"]] ) @@ -436,7 +428,7 @@ def test_roc_auc_score_returns_matches_sklearn(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score(df[["y_true"]], df[["y_score"]]) + score = metrics.roc_auc_score(df[["y_true"]], df[["y_score"]]) expected_score = sklearn_metrics.roc_auc_score( pd_df[["y_true"]], pd_df[["y_score"]] ) @@ -453,7 +445,7 @@ def test_roc_auc_score_series(session): ) df = session.read_pandas(pd_df) - score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) + score = metrics.roc_auc_score(df["y_true"], df["y_score"]) assert score == 0.625 @@ -462,33 +454,33 @@ def test_auc_invalid_x_size(session): pd_df = pd.DataFrame({"x_arbitrary_name": [0], "y_arbitrary_name": [0]}) df = session.read_pandas(pd_df) with pytest.raises(ValueError): - bigframes.ml.metrics.auc(df[["x_arbitrary_name"]], df[["y_arbitrary_name"]]) + metrics.auc(df[["x_arbitrary_name"]], df[["y_arbitrary_name"]]) def test_auc_nondecreasing_x(session): pd_df = pd.DataFrame({"x": [0, 0, 0.5, 0.5, 1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == 0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == 0.75 def test_auc_nonincreasing_x(session): pd_df = pd.DataFrame({"x": [0, 0, -0.5, -0.5, -1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == 0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == 0.75 def test_auc_nonincreasing_x_negative(session): pd_df = pd.DataFrame({"x": [0, 0, -0.5, -0.5, -1], "y": [0, -0.5, -0.5, -1, -1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df[["x"]], df[["y"]]) == -0.75 + assert metrics.auc(df[["x"]], df[["y"]]) == -0.75 def test_auc_series(session): pd_df = pd.DataFrame({"x": [0, 0, 0.5, 0.5, 1], "y": [0, 0.5, 0.5, 1, 1]}) df = session.read_pandas(pd_df) - assert bigframes.ml.metrics.auc(df["x"], df["y"]) == 0.75 + assert metrics.auc(df["x"], df["y"]) 
== 0.75 def test_confusion_matrix(session): @@ -499,7 +491,7 @@ def test_confusion_matrix(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( + confusion_matrix = metrics.confusion_matrix( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]] ) expected_pd_df = pd.DataFrame( @@ -522,9 +514,7 @@ def test_confusion_matrix_column_index(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_pd_df = ( pd.DataFrame( {1: [1, 0, 1, 0], 2: [0, 0, 2, 0], 3: [0, 0, 0, 0], 4: [0, 1, 0, 1]} @@ -545,9 +535,7 @@ def test_confusion_matrix_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -565,9 +553,7 @@ def test_confusion_matrix_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix( - df[["y_true"]], df[["y_pred"]] - ) + confusion_matrix = metrics.confusion_matrix(df[["y_true"]], df[["y_pred"]]) expected_confusion_matrix = sklearn_metrics.confusion_matrix( pd_df[["y_true"]], pd_df[["y_pred"]] ) @@ -588,7 +574,7 @@ def test_confusion_matrix_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - confusion_matrix = bigframes.ml.metrics.confusion_matrix(df["y_true"], df["y_pred"]) + confusion_matrix = metrics.confusion_matrix(df["y_true"], df["y_pred"]) expected_pd_df = pd.DataFrame( { 0: [2, 0, 1], @@ -609,7 +595,7 @@ def test_recall_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( + recall = metrics.recall_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [1.000000, 0.000000, 0.666667] @@ -627,9 +613,7 @@ def test_recall_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -646,9 +630,7 @@ def test_recall_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + recall = metrics.recall_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.recall_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -665,7 +647,7 @@ def test_recall_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - recall = bigframes.ml.metrics.recall_score(df["y_true"], df["y_pred"], average=None) + recall = metrics.recall_score(df["y_true"], df["y_pred"], average=None) expected_values = [1.000000, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_recall = pd.Series(expected_values, index=expected_index) @@ -681,7 +663,7 @@ def test_precision_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = 
bigframes.ml.metrics.precision_score( + precision_score = metrics.precision_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.666667, 0.000000, 0.666667] @@ -701,7 +683,7 @@ def test_precision_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( + precision_score = metrics.precision_score( df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( @@ -722,7 +704,7 @@ def test_precision_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( + precision_score = metrics.precision_score( df[["y_true"]], df[["y_pred"]], average=None ) expected_values = sklearn_metrics.precision_score( @@ -743,9 +725,7 @@ def test_precision_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - precision_score = bigframes.ml.metrics.precision_score( - df["y_true"], df["y_pred"], average=None - ) + precision_score = metrics.precision_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.666667, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_precision = pd.Series(expected_values, index=expected_index) @@ -763,7 +743,7 @@ def test_f1_score(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( + f1_score = metrics.f1_score( df[["y_true_arbitrary_name"]], df[["y_pred_arbitrary_name"]], average=None ) expected_values = [0.8, 0.000000, 0.666667] @@ -781,9 +761,7 @@ def test_f1_score_matches_sklearn(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -800,9 +778,7 @@ def test_f1_score_str_matches_sklearn(session): } ).astype("str") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score( - df[["y_true"]], df[["y_pred"]], average=None - ) + f1_score = metrics.f1_score(df[["y_true"]], df[["y_pred"]], average=None) expected_values = sklearn_metrics.f1_score( pd_df[["y_true"]], pd_df[["y_pred"]], average=None ) @@ -819,9 +795,16 @@ def test_f1_score_series(session): } ).astype("Int64") df = session.read_pandas(pd_df) - f1_score = bigframes.ml.metrics.f1_score(df["y_true"], df["y_pred"], average=None) + f1_score = metrics.f1_score(df["y_true"], df["y_pred"], average=None) expected_values = [0.8, 0.000000, 0.666667] expected_index = [0, 1, 2] expected_f1 = pd.Series(expected_values, index=expected_index) pd.testing.assert_series_equal(f1_score, expected_f1, check_index_type=False) + + +def test_mean_squared_error(session: bigframes.Session): + pd_df = pd.DataFrame({"y_true": [3, -0.5, 2, 7], "y_pred": [2.5, 0.0, 2, 8]}) + df = session.read_pandas(pd_df) + mse = metrics.mean_squared_error(df["y_true"], df["y_pred"]) + assert mse == 0.375 diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 990795da3b..faa0cd7bbd 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -18,7 +18,7 @@ import pyarrow as pa import bigframes.features -import bigframes.ml.preprocessing +from bigframes.ml import preprocessing ONE_HOT_ENCODED_DTYPE = ( 
pd.ArrowDtype(pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))) @@ -29,7 +29,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -48,27 +48,22 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "standard_scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], "standard_scaled_culmen_length_mm": [-0.81112, -0.994552, -1.104611], + "standard_scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -77,27 +72,22 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): for column in result.columns: assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.StandardScaler() + scaler = preprocessing.StandardScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -108,11 +98,6 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. 
Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "standard_scaled_culmen_length_mm": [ @@ -125,12 +110,41 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + +def test_standard_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.StandardScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.StandardScaler) + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -149,50 +163,40 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "max_abs_scaled_culmen_depth_mm": [0.874419, 0.8, 0.84186], "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], + "max_abs_scaled_culmen_depth_mm": [0.874419, 0.8, 0.84186], "max_abs_scaled_flipper_length_mm": [0.848485, 0.78355, 0.813853], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler = preprocessing.MaxAbsScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -203,11 +207,6 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], @@ -216,35 +215,59 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + +def test_max_abs_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MaxAbsScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler) + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], + "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_normalized_fit_transform(new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit(penguins_df_default_index["culmen_length_mm"]) result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() @@ -256,11 +279,6 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], @@ -269,12 +287,12 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod. - scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler = preprocessing.MinMaxScaler() scaler.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -294,52 +312,71 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): result = scaler.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238], "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], + "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238], "min_max_scaled_flipper_length_mm": [0.40678, 0.152542, 0.271186], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + +def test_min_max_scaler_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.MinMaxScaler() + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler) + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], + "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") result = discretizer.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], "kbinsdiscretizer_culmen_length_mm": ["bin_5", "bin_3", "bin_2"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], "kbinsdiscretizer_flipper_length_mm": ["bin_5", "bin_2", "bin_4"], }, dtype="string[pyarrow]", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit(penguins_df_default_index["culmen_length_mm"]) result = discretizer.transform( @@ -347,11 +384,6 @@ def test_k_bins_discretizer_series_normalizes( ).to_pandas() result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], @@ -360,12 +392,12 @@ def test_k_bins_discretizer_series_normalizes( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") + discretizer = preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -380,31 +412,24 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], }, dtype="string[pyarrow]", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_k_bins_discretizer_normalizes_different_params( penguins_df_default_index, new_penguins_df ): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer( - n_bins=6, strategy="uniform" - ) + discretizer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -419,46 +444,67 @@ def test_k_bins_discretizer_normalizes_different_params( result = discretizer.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], }, dtype="string[pyarrow]", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + +def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="uniform") + transformer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer) + assert reloaded_transformer.n_bins == transformer.n_bins + assert reloaded_transformer.strategy == transformer.strategy + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"], + "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) def test_one_hot_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ - [{"index": 2, "value": 1.0}], + "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], ], }, dtype=ONE_HOT_ENCODED_DTYPE, @@ -469,26 +515,21 @@ def test_one_hot_encoder_default_params(new_penguins_df): def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ - [{"index": 2, "value": 1.0}], + "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], - [{"index": 2, "value": 1.0}], ], }, dtype=ONE_HOT_ENCODED_DTYPE, @@ -499,16 +540,11 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): def test_one_hot_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "onehotencoded_species": [ @@ -525,24 +561,19 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): def test_one_hot_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2) + encoder = preprocessing.OneHotEncoder("most_frequent", 100, 2) encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { - "onehotencoded_sex": [ + "onehotencoded_species": [ [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], ], - "onehotencoded_species": [ + "onehotencoded_sex": [ [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], [{"index": 0, "value": 1.0}], @@ -556,28 +587,59 @@ def test_one_hot_encoder_params(new_penguins_df): def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.OneHotEncoder() + encoder = preprocessing.OneHotEncoder() encoder.fit(penguins_df_default_index[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], "onehotencoded_sex": [ [{"index": 3, "value": 1.0}], [{"index": 2, "value": 1.0}], [{"index": 2, "value": 1.0}], ], + }, + dtype=ONE_HOT_ENCODED_DTYPE, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_one_hot_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.OneHotEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species", "sex"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.OneHotEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.fit_transform( + new_penguins_df[["species", "sex"]] + ).to_pandas() + + expected = pd.DataFrame( + { "onehotencoded_species": [ [{"index": 1, "value": 1.0}], [{"index": 1, "value": 1.0}], [{"index": 2, "value": 1.0}], ], + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + ], }, dtype=ONE_HOT_ENCODED_DTYPE, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -587,16 +649,11 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ def test_label_encoder_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df["species"]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -613,15 +670,10 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -638,16 +690,11 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): def test_label_encoder_series_default_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(new_penguins_df["species"]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... 
- result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -664,16 +711,11 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) + encoder = preprocessing.LabelEncoder(100, 2) encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - expected = pd.DataFrame( { "labelencoded_species": [ @@ -690,15 +732,39 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): - encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder = preprocessing.LabelEncoder() encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) + expected = pd.DataFrame( + { + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_label_encoder_save_load(new_penguins_df, dataset_id): + transformer = preprocessing.LabelEncoder(min_frequency=1, max_categories=10) + transformer.fit(new_penguins_df[["species"]]) + + reloaded_transformer = transformer.to_gbq( + f"{dataset_id}.temp_configured_model", replace=True + ) + assert isinstance(reloaded_transformer, preprocessing.LabelEncoder) + assert reloaded_transformer.min_frequency == transformer.min_frequency + assert reloaded_transformer.max_categories == transformer.max_categories + assert reloaded_transformer._bqml_model is not None + + result = reloaded_transformer.transform(new_penguins_df).to_pandas() expected = pd.DataFrame( { diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index b952289a72..2824e86979 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime + import pandas as pd import pytest @@ -303,3 +305,65 @@ def test_dt_floor(scalars_dfs, col_name, freq): pd_result.astype(scalars_df[col_name].dtype), # floor preserves type bf_result, ) + + +def test_dt_compare_coerce_str_datetime(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["datetime_col"] + bf_result = (bf_series >= "2024-01-01").to_pandas() + + pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01") + + # pandas produces pyarrow bool dtype + assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_dt_clip_datetime_literals(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ).to_pandas() + + pd_result = scalars_pandas_df["date_col"].clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_dt_clip_coerce_str_date(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["date_col"] + bf_result = bf_series.clip("2020-01-01", "2024-01-01").to_pandas() + + # Pandas can't coerce with pyarrow types so convert first + pd_result = scalars_pandas_df["date_col"].clip( + datetime.date(2020, 1, 1), datetime.date(2024, 1, 1) + ) + + assert_series_equal( + pd_result, + bf_result, + ) + + +def test_dt_clip_coerce_str_timestamp(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series: bigframes.series.Series = scalars_df["timestamp_col"] + bf_result = bf_series.clip( + "2020-01-01T20:03:50Z", "2024-01-01T20:03:50Z" + ).to_pandas() + + pd_result = scalars_pandas_df["timestamp_col"].clip( + pd.to_datetime("2020-01-01T20:03:50Z", utc=True), + pd.to_datetime("2024-01-01T20:03:50Z", utc=True), + ) + + assert_series_equal( + pd_result, + bf_result, + ) diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 41ea7d4ebb..6542ce6de3 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -240,6 +240,48 @@ def test_scatter_args_c(c): ) +@pytest.mark.parametrize( + ("s"), + [ + pytest.param([10, 34, 50], id="int"), + pytest.param([1.0, 3.4, 5.0], id="float"), + pytest.param( + [True, True, False], id="bool", marks=pytest.mark.xfail(raises=ValueError) + ), + ], +) +def test_scatter_args_s(s): + data = { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + data["s"] = s + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + ax = df.plot.scatter(x="a", y="b", s="s") + pd_ax = pd_df.plot.scatter(x="a", y="b", s="s") + tm.assert_numpy_array_equal( + ax.collections[0].get_sizes(), pd_ax.collections[0].get_sizes() + ) + + +@pytest.mark.parametrize( + ("arg_name"), + [ + pytest.param("c", marks=pytest.mark.xfail(raises=NotImplementedError)), + pytest.param("s", marks=pytest.mark.xfail(raises=NotImplementedError)), + ], +) +def test_scatter_sequence_arg(arg_name): + data = { + "a": [1, 2, 3], + "b": [1, 2, 3], + } + arg_value = [3, 3, 1] + bpd.DataFrame(data).plot.scatter(x="a", y="b", **{arg_name: arg_value}) + + def test_sampling_plot_args_n(): df = bpd.DataFrame(np.arange(bf_mpl.DEFAULT_SAMPLING_N * 10), columns=["one"]) ax = df.plot.line() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 99ee6680fa..5d6a859c11 100644 --- 
a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -20,6 +20,7 @@ from typing import Tuple import geopandas as gpd # type: ignore +import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -27,7 +28,9 @@ import bigframes import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe +import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( assert_pandas_df_equal, @@ -65,6 +68,13 @@ def test_df_construct_pandas_default(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_construct_large_strings(): + data = [["hello", "w" + "o" * 50000 + "rld"]] + bf_result = dataframe.DataFrame(data).to_pandas() + pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_df_construct_pandas_load_job(scalars_dfs): # This should trigger the inlined codepath columns = [ @@ -494,6 +504,17 @@ def test_df_peek_force_default(scalars_dfs): assert len(peek_result) == 3 +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + def test_repr_w_all_rows(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs @@ -605,17 +626,24 @@ def test_assign_new_column_w_loc(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) -def test_assign_new_column_w_setitem(scalars_dfs): +@pytest.mark.parametrize( + ("scalar",), + [ + (2.1,), + (None,), + ], +) +def test_assign_new_column_w_setitem(scalars_dfs, scalar): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = 2 - pd_df["new_col"] = 2 + bf_df["new_col"] = scalar + pd_df["new_col"] = scalar bf_result = bf_df.to_pandas() pd_result = pd_df - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") + # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Float64") pd.testing.assert_frame_equal(bf_result, pd_result) @@ -2003,7 +2031,7 @@ def test_mod(scalars_dfs, other_scalar): def test_scalar_binop_str_exception(scalars_dfs): scalars_df, _ = scalars_dfs columns = ["string_col"] - with pytest.raises(Exception): + with pytest.raises(TypeError, match="Cannot add dtypes"): (scalars_df[columns] + 1).to_pandas() @@ -2056,6 +2084,37 @@ def test_series_binop_axis_index( assert_pandas_df_equal(bf_result, pd_result) + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (bf_indexes.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + (series.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "bf_index", + "pd_series", + "bf_series", + ], +) +def test_listlike_binop_axis_1(scalars_dfs, input): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("left_labels", "right_labels"), [ @@ -3822,6 +3881,44 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("expr",), + [ + ("new_col = int64_col + int64_too",), + ("new_col = (rowindex > 3) | bool_col",), + ("int64_too = bool_col\nnew_col2 = rowindex",), + ], +) +def test_df_eval(scalars_dfs, expr): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.eval(expr).to_pandas() + pd_result = scalars_pandas_df.eval(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@skip_legacy_pandas +@pytest.mark.parametrize( + ("expr",), + [ + ("int64_col > int64_too",), + ("bool_col",), + ("((int64_col - int64_too) % @local_var) == 0",), + ], +) +def test_df_query(scalars_dfs, expr): + # local_var is referenced in expressions + local_var = 3 # NOQA + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("subset", "normalize", "ascending", "dropna"), [ @@ -4048,6 +4145,56 @@ def test_recursion_limit(scalars_df_index): scalars_df_index.to_pandas() + +def test_query_complexity_repeated_joins( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(6): + # recursively self-join, resulting in 2^6 - 1 = 63 joins + pd_df = pd_df.merge(pd_df, on="int64_col").head(30) + pd_df = pd_df[pd_df.columns[:20]] + bf_df = bf_df.merge(bf_df, on="int64_col").head(30) + bf_df = bf_df[bf_df.columns[:20]] + + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) + + +def test_query_complexity_repeated_subtrees( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + # Recursively union the data; if fully inlined, this has 10^5 identical root tables.
+ pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(5): + pd_df = pd.concat(10 * [pd_df]).head(5) + bf_df = bigframes.pandas.concat(10 * [bf_df]).head(5) + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_query_complexity_repeated_analytic( + scalars_df_index, scalars_pandas_df_index, with_multiquery_execution +): + bf_df = scalars_df_index[["int64_col", "int64_too"]] + pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] + # Uses LAG analytic operator, each in a new SELECT + for _ in range(50): + bf_df = bf_df.diff() + pd_df = pd_df.diff() + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + def test_to_pandas_downsampling_option_override(session): df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") download_size = 1 @@ -4072,3 +4219,72 @@ def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_cre loaded_scalars_df_index = session.read_gbq(result_table) assert not loaded_scalars_df_index.empty + + +@pytest.mark.parametrize( + ("col_names", "ignore_index"), + [ + pytest.param(["A"], False, id="one_array_false"), + pytest.param(["A"], True, id="one_array_true"), + pytest.param(["B"], False, id="one_float_false"), + pytest.param(["B"], True, id="one_float_true"), + pytest.param(["A", "C"], False, id="two_arrays_false"), + pytest.param(["A", "C"], True, id="two_arrays_true"), + ], +) +def test_dataframe_explode(col_names, ignore_index): + data = { + "A": [[0, 1, 2], [], [3, 4]], + "B": 3, + "C": [["a", "b", "c"], np.nan, ["d", "e"]], + } + df = bpd.DataFrame(data) + pd_df = df.to_pandas() + pd.testing.assert_frame_equal( + df.explode(col_names, ignore_index=ignore_index).to_pandas(), + pd_df.explode(col_names, ignore_index=ignore_index), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_dataframe_explode_reserve_order(ignore_index, ordered): + data = { + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( + pd.Int64Dtype() + ) + pd.testing.assert_frame_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param( + ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), + ], +) +def test_dataframe_explode_xfail(col_names): + df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) + df.explode(col_names) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index e7ecbedfc2..ba79ba1ab1 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -228,8 +228,7 @@ def 
test_dataframe_groupby_multi_sum(
         (lambda x: x.cumsum(numeric_only=True)),
         (lambda x: x.cummax(numeric_only=True)),
         (lambda x: x.cummin(numeric_only=True)),
-        # pandas 2.2 uses floating point for cumulative product even for
-        # integer inputs.
+        # Pre-pandas 2.2 doesn't always produce float.
         (lambda x: x.cumprod().astype("Float64")),
         (lambda x: x.shift(periods=2)),
     ],
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index 1f39ba25fe..c419dc4907 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -370,3 +370,42 @@ def test_index_isin(scalars_df_index, scalars_pandas_df_index):
         bf_series,
         check_names=False,
     )
+
+
+def test_multiindex_name_is_none(session):
+    df = pd.DataFrame(
+        {
+            "A": [0, 0, 0, 1, 1, 1],
+            "B": ["x", "y", "z", "x", "y", "z"],
+            "C": [123, 345, 789, -123, -345, -789],
+            "D": ["a", "b", "c", "d", "e", "f"],
+        },
+    )
+    index = session.read_pandas(df).set_index(["A", "B"]).index
+    assert index.name is None
+
+
+def test_multiindex_names_not_none(session):
+    df = pd.DataFrame(
+        {
+            "A": [0, 0, 0, 1, 1, 1],
+            "B": ["x", "y", "z", "x", "y", "z"],
+            "C": [123, 345, 789, -123, -345, -789],
+            "D": ["a", "b", "c", "d", "e", "f"],
+        },
+    )
+    index = session.read_pandas(df).set_index(["A", "B"]).index
+    assert tuple(index.names) == ("A", "B")
+
+
+def test_multiindex_repr_includes_all_names(session):
+    df = pd.DataFrame(
+        {
+            "A": [0, 0, 0, 1, 1, 1],
+            "B": ["x", "y", "z", "x", "y", "z"],
+            "C": [123, 345, 789, -123, -345, -789],
+            "D": ["a", "b", "c", "d", "e", "f"],
+        },
+    )
+    index = session.read_pandas(df).set_index(["A", "B"]).index
+    assert "names=['A', 'B']" in repr(index)
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index d585d4f73e..6aca7628cf 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
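A rough illustration of the (Series|DataFrame).explode behavior covered by the explode tests in this change; the data below is made up for this sketch, and a configured BigQuery session is assumed:

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"a": [[1, 2], [], [3, 4]], "b": ["x", "y", "z"]})
    # Each array element becomes its own row; an empty array yields a single null row.
    long_df = df.explode(["a"])
    # ignore_index=True renumbers the exploded rows with a fresh default index.
    renumbered = df.explode(["a"], ignore_index=True)
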
+import numpy as np import pandas import pytest @@ -1168,3 +1169,19 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1 @ bf2 + + +def test_explode_w_multi_index(): + data = [[[1, 1], np.nan, [3, 3]], [[2], [5], []]] + multi_level_columns = pandas.MultiIndex.from_arrays( + [["col0", "col0", "col1"], ["col00", "col01", "col11"]] + ) + + df = bpd.DataFrame(data, columns=multi_level_columns) + pd_df = df.to_pandas() + pandas.testing.assert_frame_equal( + df["col0"].explode("col00").to_pandas(), + pd_df["col0"].explode("col00"), + check_dtype=False, + check_index_type=False, + ) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 5c2a93ec39..8e349e472a 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -56,6 +56,10 @@ def test_series_ufuncs(floats_pd, floats_bf, opname): ("log10",), ("sqrt",), ("abs",), + ("floor",), + ("ceil",), + ("expm1",), + ("log1p",), ], ) def test_df_ufuncs(scalars_dfs, opname): @@ -77,6 +81,7 @@ def test_df_ufuncs(scalars_dfs, opname): ("multiply",), ("divide",), ("power",), + ("arctan2",), ], ) def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): @@ -112,6 +117,23 @@ def test_df_binary_ufuncs(scalars_dfs, opname): pd.testing.assert_frame_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("x", "y"), + [ + ("int64_col", "int64_col"), + ("float64_col", "int64_col"), + ], +) +def test_series_atan2(scalars_dfs, x, y): + # Test atan2 separately as pandas errors when passing entire df as input, so pass only series + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = np.arctan2(scalars_df[x], scalars_df[y]).to_pandas() + pd_result = np.arctan2(scalars_pandas_df[x], scalars_pandas_df[y]) + + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_series_binary_ufuncs_reverse(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 1c04b580fc..ea139b9802 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -20,7 +20,7 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers -from bigframes.session import MAX_INLINE_DF_SIZE +from bigframes.session import MAX_INLINE_DF_BYTES job_load_message_regex = r"\w+ job [\w-]+ is \w+\." @@ -70,7 +70,7 @@ def test_progress_bar_load_jobs( ): # repeat the DF to be big enough to trigger the load job. df = penguins_pandas_df_default_index - while len(df) < MAX_INLINE_DF_SIZE: + while len(df) < MAX_INLINE_DF_BYTES: df = pd.DataFrame(np.repeat(df.values, 2, axis=0)) bf.options.display.progress_bar = "terminal" diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 8847753e88..e350286940 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1481,7 +1481,7 @@ def test_groupby_prod(scalars_dfs): bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() pd_series = ( scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() - ) + ).astype(pd.Float64Dtype()) # TODO(swast): Update groupby to use index based on group by key(s). 
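A rough illustration of the NumPy ufunc coverage added in test_numpy.py above (floor, ceil, expm1, log1p, and the binary arctan2); the values below are made up for this sketch, and a configured BigQuery session is assumed:

    import numpy as np
    import bigframes.pandas as bpd

    s1 = bpd.Series([1.0, 2.0, 3.0])
    s2 = bpd.Series([3.0, 2.0, 1.0])
    # The ufuncs dispatch to BigQuery DataFrames operators, so the work runs in BigQuery.
    floors = np.floor(s1)
    angles = np.arctan2(s1, s2)
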
bf_result = bf_series.to_pandas() assert_series_equal( @@ -1529,10 +1529,16 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) ) -def test_drop_label(scalars_df_index, scalars_pandas_df_index): - col_name = "int64_col" - bf_series = scalars_df_index[col_name].drop(1).to_pandas() - pd_series = scalars_pandas_df_index[col_name].drop(1) +@pytest.mark.parametrize( + ("label", "col_name"), + [ + (0, "bool_col"), + (1, "int64_col"), + ], +) +def test_drop_label(scalars_df_index, scalars_pandas_df_index, label, col_name): + bf_series = scalars_df_index[col_name].drop(label).to_pandas() + pd_series = scalars_pandas_df_index[col_name].drop(label) pd.testing.assert_series_equal( pd_series, bf_series, @@ -2783,6 +2789,12 @@ def test_string_astype_float(): def test_string_astype_date(): + if int(pa.__version__.split(".")[0]) < 15: + pytest.skip( + "Avoid pyarrow.lib.ArrowNotImplementedError: " + "Unsupported cast from string to date32 using function cast_date32." + ) + pd_series = pd.Series(["2014-08-15", "2215-08-15", "2016-02-29"]).astype( pd.ArrowDtype(pa.string()) ) @@ -3410,3 +3422,104 @@ def foo(x: int, y: int, df): ) assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param([1, 2, 3], id="int"), + pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"), + pytest.param( + [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array" + ), + pytest.param( + [ + {"A": {"x": 1.0}, "B": "b"}, + {"A": {"y": 2.0}, "B": "bb"}, + {"A": {"z": 4.0}}, + {}, + numpy.nan, + ], + id="struct_array", + ), + ], +) +def test_series_explode(data): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + pd_s.explode(), + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("index", "ignore_index"), + [ + pytest.param(None, True, id="default_index"), + pytest.param(None, False, id="ignore_default_index"), + pytest.param([5, 1, 3, 2], True, id="unordered_index"), + pytest.param([5, 1, 3, 2], False, id="ignore_unordered_index"), + pytest.param(["z", "x", "a", "b"], True, id="str_index"), + pytest.param(["z", "x", "a", "b"], False, id="ignore_str_index"), + ], +) +def test_series_explode_w_index(index, ignore_index): + data = [[], [200.0, 23.12], [4.5, -9.0], [1.0]] + s = bigframes.pandas.Series(data, index=index) + pd_s = pd.Series(data, index=index) + pd.testing.assert_series_equal( + s.explode(ignore_index=ignore_index).to_pandas(), + pd_s.explode(ignore_index=ignore_index).astype(pd.Float64Dtype()), + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_series_explode_reserve_order(ignore_index, ordered): + data = [numpy.random.randint(0, 10, 10) for _ in range(10)] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + + res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()) + pd.testing.assert_series_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +def test_series_explode_w_aggregate(): + data = [[1, 2, 3], [], numpy.nan, [3, 4]] + s = bigframes.pandas.Series(data) + pd_s = pd.Series(data) + assert 
s.explode().sum() == pd_s.explode().sum() + + +@pytest.mark.parametrize( + ("data"), + [ + pytest.param(numpy.nan, id="null"), + pytest.param([numpy.nan], id="null_array"), + pytest.param([[]], id="empty_array"), + pytest.param([numpy.nan, []], id="null_and_empty_array"), + ], +) +def test_series_explode_null(data): + s = bigframes.pandas.Series(data) + pd.testing.assert_series_equal( + s.explode().to_pandas(), + s.to_pandas().explode(), + check_dtype=False, + ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index d0c20f3839..eb6a0a8dd9 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -27,7 +27,7 @@ import pytest import bigframes -import bigframes.core.indexes.index +import bigframes.core.indexes.base import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model @@ -421,6 +421,21 @@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_series(session): + idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + pd_series = pd.Series([3, 1, 4, 1, 5], dtype=pd.Int64Dtype(), index=idx) + bf_series = session.read_pandas(pd_series) + + pd.testing.assert_series_equal(bf_series.to_pandas(), pd_series) + + +def test_read_pandas_index(session): + pd_idx = pd.Index([2, 7, 1, 2, 8], dtype=pd.Int64Dtype()) + bf_idx = session.read_pandas(pd_idx) + + pd.testing.assert_index_equal(bf_idx.to_pandas(), pd_idx) + + def test_read_pandas_inline_respects_location(): options = bigframes.BigQueryOptions(location="europe-west1") session = bigframes.Session(options) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 913bab0379..5b1ff37775 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -373,17 +373,17 @@ def test_ml_generate_text_correct( ) -def test_ml_generate_text_embedding_correct( +def test_ml_generate_embedding_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): - sql = model_manipulation_sql_generator.ml_generate_text_embedding( + sql = model_manipulation_sql_generator.ml_generate_embedding( source_df=mock_df, struct_options={"option_key1": 1, "option_key2": 2.2}, ) assert ( sql - == """SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, (input_X_sql), STRUCT( 1 AS option_key1, 2.2 AS option_key2))""" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 3f89feaa34..88826b31ce 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -26,11 +26,17 @@ def _to_json_string(translator, op: vendored_ibis_ops.ToJsonString): return f"TO_JSON_STRING({arg})" +def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): + arg = translator.translate(op.arg) + return f"GENERATE_ARRAY(0, {arg})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore + vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git 
a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 2c2efe528d..3d5a5a7fa0 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -2,5 +2,6 @@ from __future__ import annotations from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 +from bigframes_vendored.ibis.expr.operations.generic import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/ibis/expr/operations/generic.py b/third_party/bigframes_vendored/ibis/expr/operations/generic.py new file mode 100644 index 0000000000..82d0a13371 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/expr/operations/generic.py @@ -0,0 +1,9 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/generic.py +from __future__ import annotations + +import ibis.expr.datatypes as dt +from ibis.expr.operations.core import Unary + + +class GenerateArray(Unary): + dtype = dt.Array(dt.int64) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py index bd5f055ece..ce5f8d55f3 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/datetimelike.py @@ -26,7 +26,7 @@ def strftime(self, date_format: str): 0 August 15, 2014, 08:15:12 AM 1 February 29, 2012, 02:15:12 AM 2 August 15, 2015, 03:15:12 AM - Name: 0, dtype: string + dtype: string Args: date_format (str): diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py index ded5a22b8f..872a64db6c 100644 --- a/third_party/bigframes_vendored/pandas/core/common.py +++ b/third_party/bigframes_vendored/pandas/core/common.py @@ -3,6 +3,8 @@ from typing import Callable, TYPE_CHECKING +from bigframes_vendored.pandas.core.dtypes.inference import iterable_not_string + if TYPE_CHECKING: from bigframes_vendored.pandas.pandas._typing import T @@ -40,3 +42,27 @@ def pipe( return func(*args, **kwargs) else: return func(obj, *args, **kwargs) + + +def flatten(line): + """ + Flatten an arbitrarily nested sequence. + + Parameters + ---------- + line : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. + + Returns + ------- + flattened : generator + """ + for element in line: + if iterable_not_string(element): + yield from flatten(element) + else: + yield element diff --git a/third_party/bigframes_vendored/pandas/core/computation/align.py b/third_party/bigframes_vendored/pandas/core/computation/align.py new file mode 100644 index 0000000000..2608dabe7a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/align.py @@ -0,0 +1,226 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/align.py +""" +Core eval alignment algorithms. 
+""" +from __future__ import annotations + +from functools import partial, wraps +from typing import Callable, TYPE_CHECKING +import warnings + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import result_type_many +from bigframes_vendored.pandas.util._exceptions import find_stack_level +import numpy as np +from pandas.errors import PerformanceWarning + +if TYPE_CHECKING: + from collections.abc import Sequence + + from bigframes_vendored.pandas.core.generic import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + from pandas._typing import F + + +def _align_core_single_unary_op( + term, +) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: + typ: partial | type[NDFrame] + axes: dict[str, Index] | None = None + + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) + + return typ, axes + + +def _zip_axes_from_type( + typ: type[NDFrame], new_axes: Sequence[Index] +) -> dict[str, Index]: + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + + +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. + """ + return any(is_pandas_object(term.value) for term in terms) + + +def _filter_special_cases(f) -> Callable[[F], F]: + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + term_values = (term.value for term in terms) + + # we don't have any pandas objects + if not _any_pandas_objects(terms): + return result_type_many(*term_values), None + + return f(terms) + + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] + term_dims = [terms[i].value.ndim for i in term_index] + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + gt_than_one_axis = naxes > 1 + + for value in (terms[i].value for i in term_index): + value_is_series = is_series(value) + is_series_and_gt_one_axis = value_is_series and gt_than_one_axis + + for axis, items in enumerate(value.axes): + if is_series_and_gt_one_axis: + ax, itm = naxes - 1, value.index + else: + ax, itm = axis, items + + if not axes[ax].is_(itm): + axes[ax] = axes[ax].join(itm, how="outer") + + for i, ndim in ndims.items(): + for axis, items in zip(range(ndim), axes): + ti = terms[i].value + + if hasattr(ti, "reindex"): + transpose = value_is_series(ti) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) + if ordm >= 1 and reindexer_size >= 10000: + w = ( + f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer." 
+ ) + warnings.warn( + w, category=PerformanceWarning, stacklevel=find_stack_level() + ) + + obj = ti.reindex(reindexer, axis=axis, copy=False) + terms[i].update(obj) + + terms[i].update(terms[i].value.values) + + return typ, _zip_axes_from_type(typ, axes) + + +def align_terms(terms): + """ + Align a set of terms. + """ + try: + # flatten the parse tree (a nested list, really) + terms = list(com.flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if is_series_or_dataframe(terms.value): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None + + # if all resolved variables are numeric scalars + if all(term.is_scalar for term in terms): + return result_type_many(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def reconstruct_object(typ, obj, axes, dtype): + """ + Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + ret : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if not isinstance(typ, partial) and is_pandas_type(typ): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + # The condition is to distinguish 0-dim array (returned in case of + # scalar) and 1 element array + # e.g. np.array(0) and np.array([0]) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) + + return ret_value + + +# Custom to recognize BigFrames types +def is_series(obj) -> bool: + from bigframes_vendored.pandas.core.series import Series + + return isinstance(obj, Series) + + +def is_series_or_dataframe(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + + return isinstance(obj, NDFrame) + + +def is_pandas_object(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return isinstance(obj, NDFrame) or isinstance(obj, Index) + + +def is_pandas_type(type) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return issubclass(type, NDFrame) or issubclass(type, Index) diff --git a/third_party/bigframes_vendored/pandas/core/computation/common.py b/third_party/bigframes_vendored/pandas/core/computation/common.py new file mode 100644 index 0000000000..7775489d0d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/common.py @@ -0,0 +1,48 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/common.py +from __future__ import annotations + +from functools import reduce + +import numpy as np +from pandas._config import get_option + + +def ensure_decoded(s) -> str: + """ + If we have bytes, decode them to unicode. 
+ """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(get_option("display.encoding")) + return s + + +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. + """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + except TypeError: + from pandas.core.dtypes.cast import find_common_type + from pandas.core.dtypes.common import is_extension_array_dtype + + arr_and_dtypes = list(arrays_and_dtypes) + ea_dtypes, non_ea_dtypes = [], [] + for arr_or_dtype in arr_and_dtypes: + if is_extension_array_dtype(arr_or_dtype): + ea_dtypes.append(arr_or_dtype) + else: + non_ea_dtypes.append(arr_or_dtype) + + if non_ea_dtypes: + try: + np_dtype = np.result_type(*non_ea_dtypes) + except ValueError: + np_dtype = reduce(np.result_type, arrays_and_dtypes) + return find_common_type(ea_dtypes + [np_dtype]) + + return find_common_type(ea_dtypes) diff --git a/third_party/bigframes_vendored/pandas/core/computation/engines.py b/third_party/bigframes_vendored/pandas/core/computation/engines.py new file mode 100644 index 0000000000..15fd48b237 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/engines.py @@ -0,0 +1,94 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/engines.py +""" +Engine classes for :func:`~pandas.eval` +""" +from __future__ import annotations + +import abc + +from bigframes_vendored.pandas.core.computation.align import ( + align_terms, + reconstruct_object, +) +from pandas.io.formats import printing + + +class AbstractEngine(metaclass=abc.ABCMeta): + """Object serving as a base class for all engines.""" + + has_neg_frac = False + + def __init__(self, expr) -> None: + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self) -> str: + """ + Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return printing.pprint_thing(self.expr) + + def evaluate(self) -> object: + """ + Run the engine on the expression. + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = align_terms(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) + + @property + def _is_aligned(self) -> bool: + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """ + Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. + """ + + +class PythonEngine(AbstractEngine): + """ + Evaluate an expression in Python space. + + Mostly for testing purposes. 
+ """ + + has_neg_frac = False + + def evaluate(self): + return self.expr() + + def _evaluate(self) -> None: + pass + + +ENGINES: dict[str, type[AbstractEngine]] = { + "python": PythonEngine, +} diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py new file mode 100644 index 0000000000..56d60174a6 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -0,0 +1,368 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/eval.py +""" +Top level ``eval`` module. +""" +from __future__ import annotations + +import tokenize +from typing import TYPE_CHECKING +import warnings + +from bigframes_vendored.pandas.core.computation.engines import ENGINES +from bigframes_vendored.pandas.core.computation.expr import Expr, PARSERS +from bigframes_vendored.pandas.core.computation.parsing import tokenize_string +from bigframes_vendored.pandas.core.computation.scope import ensure_scope +from bigframes_vendored.pandas.core.generic import NDFrame +from bigframes_vendored.pandas.util._validators import validate_bool_kwarg +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas.core.computation.ops import BinOp + + +def _check_engine(engine: str | None) -> str: + """ + Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + String to validate. + + Raises + ------ + KeyError + * If an invalid engine is passed. + + Returns + ------- + str + Engine name. + """ + + if engine is None: + engine = "python" + + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) + raise KeyError( + f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" + ) + + return engine + + +def _check_parser(parser: str): + """ + Make sure a valid parser is passed. + + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in PARSERS: + raise KeyError( + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" + ) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, "__getitem__"): + name = type(resolver).__name__ + raise TypeError( + f"Resolver of type '{name}' does not " + "implement the __getitem__ method" + ) + + +def _check_expression(expr): + """ + Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr) -> str: + """ + Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + str + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. 
+ """ + s = pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr: str, stack_level: int, parser: str): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != "pandas" + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ( + "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." + ) + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == "@": + raise SyntaxError(msg) + + +def eval( + expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users + parser: str = "pandas", + engine: str | None = None, + local_dict=None, + global_dict=None, + resolvers=(), + level: int = 0, + target=None, + inplace: bool = False, +): + """ + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df + animal age + 0 dog 10 + 1 pig 20 + + [2 rows x 2 columns] + + We can add a new column using ``pd.eval``: + + >>> df.eval("double_age = age * 2") + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + + [2 rows x 3 columns] + + Args: + expr (str): + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser ({'pandas', 'python'}, default 'pandas'): + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine ({'python'}, default None): + + The engine used to evaluate the expression. Supported engines are + + - None : defaults to ``python`` + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict (dict or None, optional): + A dictionary of local variables, taken from locals() by default. + global_dict (dict or None, optional): + A dictionary of global variables, taken from globals() by default. + resolvers (list of dict-like or None, optional): + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level (int, optional): + The number of prior stack frames to traverse and add to the current + scope. 
Most users will **not** need to change this parameter. + target (object, optional, default None): + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace (bool, default False): + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns: + ndarray, numeric scalar, DataFrame, Series, or None: + The completion value of evaluating the given code or None if ``inplace=True``. + + Raises: + ValueError: + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + exprs: list[str | BinOp] + if isinstance(expr, str): + _check_expression(expr) + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] + else: + # ops.BinOp; for internal compat, not intended to be passed by users + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + ret = None + first_expr = True + target_modified = False + + for expr in exprs: + expr = _convert_expression(expr) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) + + # construct the engine and evaluate the parsed expression + eng = ENGINES[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None: + if multi_line: + raise ValueError( + "Multi-line expressions are only valid " + "if all expressions contain an assignment" + ) + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + + # assign if needed + assigner = parsed_expr.assigner + if env.target is not None and assigner is not None: + target_modified = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + try: + target = env.target + if isinstance(target, NDFrame): + target = target.copy() + except AttributeError as err: + raise ValueError("Cannot return a copy of the target") from err + else: + target = env.target + + # TypeError is most commonly raised (e.g. int, list), but you + # get IndexError if you try to do this assignment on np.ndarray. + # we will ignore numpy warnings here; e.g. if trying + # to use a non-numeric indexer + try: + with warnings.catch_warnings(record=True): + # TODO: Filter the warnings we actually care about here. 
+ if inplace and isinstance(target, NDFrame): + target.loc[:, assigner] = ret + else: + target[ # pyright: ignore[reportGeneralTypeIssues] + assigner + ] = ret + except (TypeError, IndexError) as err: + raise ValueError("Cannot assign expression output to target") from err + + if not resolvers: + resolvers = ({assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if assigner in resolver: + resolver[assigner] = ret + break + else: + resolvers += ({assigner: ret},) + + ret = None + first_expr = False + + # We want to exclude `inplace=None` as being False. + if inplace is False: + return target if target_modified else ret diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py new file mode 100644 index 0000000000..44f649e59d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -0,0 +1,828 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/expr.py +""" +:func:`~pandas.eval` parsers. +""" +from __future__ import annotations + +import ast +from functools import partial, reduce +from keyword import iskeyword +import tokenize +from typing import Callable, TypeVar + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.ops import ( + ARITH_OPS_SYMS, + BinOp, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + Constant, + Div, + FuncNode, + is_term, + LOCAL_TAG, + Op, + Term, + UNARY_OPS_SYMS, + UnaryOp, +) +from bigframes_vendored.pandas.core.computation.parsing import ( + clean_backtick_quoted_toks, + tokenize_string, +) +from bigframes_vendored.pandas.core.computation.scope import Scope +import numpy as np +from pandas.errors import UndefinedVariableError +from pandas.io.formats import printing + + +def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: + """ + Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, "==" if tokval == "=" else tokval + + +def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace local variables with a syntactically valid name. 
+ + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == "@": + return tokenize.OP, LOCAL_TAG + return toknum, tokval + + +def _compose2(f, g): + """ + Compose 2 callables. + """ + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def _compose(*funcs): + """ + Compose 2 or more callables. + """ + assert len(funcs) > 1, "At least 2 callables must be passed to compose" + return reduce(_compose2, funcs) + + +def _preparse( + source: str, + f=_compose( + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks + ), +) -> str: + """ + Compose a collection of tokenization functions. + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. + + Returns + ------- + str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), "f must be callable" + return tokenize.untokenize(f(x) for x in tokenize_string(source)) + + +def _is_type(t): + """ + Factory for a type checking function of type ``t`` or tuple of types. + """ + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(str) + + +# partition all AST nodes +_all_nodes = frozenset( + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) +) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """ + Filter out AST nodes that are subclasses of ``superclass``. 
+ """ + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(x.__name__ for x in _all_nodes) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg + + +def _node_not_implemented(node_name: str) -> Callable[..., None]: + """ + Return a function that raises a NotImplementedError with a passed node name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError(f"'{node_name}' nodes are not implemented") + + return f + + +# should be bound by BaseExprVisitor but that creates a circular dependency: +# _T is used in disallow, but disallow is used to define BaseExprVisitor +# https://github.com/microsoft/pyright/issues/2315 +_T = TypeVar("_T") + + +def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: + """ + Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + callable + """ + + def disallowed(cls: type[_T]) -> type[_T]: + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes = () # type: ignore[attr-defined] + for node in nodes: + new_method = _node_not_implemented(node) + name = f"visit_{node}" + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes += (name,) # type: ignore[attr-defined] + setattr(cls, name, new_method) + return cls + + return disallowed + + +def _op_maker(op_class, op_symbol): + """ + Return a function to create an op class with its symbol already passed. + + Returns + ------- + callable + """ + + def f(self, node, *args, **kwargs): + """ + Return a partial function with an Op subclass with an operator already passed. + + Returns + ------- + callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + + return f + + +_op_classes = {"binary": BinOp, "unary": UnaryOp} + + +def add_ops(op_classes): + """ + Decorator to add default implementation of ops. 
+ """ + + def f(cls): + for op_attr_name, op_class in op_classes.items(): + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, f"visit_{op_node}", made_op) + return cls + + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """ + Custom ast walker. Parsers of other engines should subclass this class + if necessary. + + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + + const_type: type[Term] = Constant + term_type = Term + + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = UNARY_OPS_SYMS + unary_op_nodes = "UAdd", "USub", "Invert", "Not" + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn, + } + + unsupported_nodes: tuple[str, ...] + + def __init__(self, env, engine, parser, preparser=_preparse) -> None: + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, str): + clean = self.preparser(node) + try: + node = ast.fix_missing_locations(ast.parse(clean)) + except SyntaxError as e: + if any(iskeyword(x) for x in clean.split()): + e.msg = "Python keyword not valid identifier in numexpr query" + raise e + + method = f"visit_{type(node).__name__}" + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError("only a single expression is allowed") + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _maybe_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side="left") + if right is None: + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) + return op, op_class, left, right + + def _maybe_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if ( + 
left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + + def _maybe_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError( + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" + ) + + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._maybe_eval(res, eval_in_python) + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._maybe_transform_eq_ne(node) + left, right = self._maybe_downcast_constants(left, right) + return self._maybe_evaluate_binop(op, op_class, left, right) + + def visit_Div(self, node, **kwargs): + return lambda lhs, rhs: Div(lhs, rhs) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs): + return self.term_type(node.id, self.env, **kwargs) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_NameConstant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_Num(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + def visit_Constant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. 
Remove after Python 3.14 is min + def visit_Str(self, node, **kwargs): + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs): + name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """df.index[4]""" + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs): + from pandas import eval as pd_eval + + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd_eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd_eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs): + """df.index[slice(4,6)]""" + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + if len(node.targets) != 1: + raise SyntaxError("can only assign a single expression") + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError("left hand side of an assignment must be a single name") + if self.env.target is None: + raise ValueError("cannot assign without a target object") + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, "name", assigner) + if self.assigner is None: + raise SyntaxError( + "left hand side of an assignment must be a single resolvable name" + ) + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + raise + + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") + + def visit_Call(self, node, side=None, **kwargs): + if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__": + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + try: + res = self.visit(node.func) + except UndefinedVariableError: + # Check if this is a supported function name + try: + res = FuncNode(node.func.id) + except ValueError: + # Raise original error + raise + + if res is None: + # error: "expr" has no attribute "id" + raise ValueError( + f"Invalid function call {node.func.id}" # type: ignore[attr-defined] + ) + if hasattr(res, "value"): + res = res.value + + if isinstance(res, FuncNode): + new_args = [self.visit(arg) for arg in node.args] + + if node.keywords: + raise TypeError( + f'Function "{res.name}" does not 
support keyword arguments' + ) + + return res(*new_args) + + else: + new_args = [self.visit(arg)(self.env) for arg in node.args] + + for key in node.keywords: + if not isinstance(key, ast.keyword): + # error: "expr" has no attribute "id" + raise ValueError( + "keyword error in function call " # type: ignore[attr-defined] + f"'{node.func.id}'" + ) + + if key.arg: + kwargs[key.arg] = self.visit(key.value)(self.env) + + name = self.env.add_tmp(res(*new_args, **kwargs)) + return self.term_type(name=name, env=self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) + return self._maybe_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) + + +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) +class PandasExprVisitor(BaseExprVisitor): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), + ), + ) -> None: + super().__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) +class PythonExprVisitor(BaseExprVisitor): + def __init__( + self, env, engine, parser, preparser=lambda source, f=None: source + ) -> None: + super().__init__(env, engine, parser, preparser=preparser) + + +class Expr: + """ + Object encapsulating an expression. + + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + level : int, optional, default 2 + """ + + env: Scope + engine: str + parser: str + + def __init__( + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Scope | None = None, + level: int = 0, + ) -> None: + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, "assigner", None) + + def __call__(self): + return self.terms(self.env) + + def __repr__(self) -> str: + return printing.pprint_thing(self.terms) + + def __len__(self) -> int: + return len(self.expr) + + def parse(self): + """ + Parse an expression. + """ + return self._visitor.visit(self.expr) + + @property + def names(self): + """ + Get the names in an expression. 
+ """ + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py new file mode 100644 index 0000000000..75b914c876 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -0,0 +1,605 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/ops.py +""" +Operator classes for eval. +""" + +from __future__ import annotations + +from datetime import datetime +from functools import partial +import operator +from typing import Callable, Literal, TYPE_CHECKING + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import ( + ensure_decoded, + result_type_many, +) +from bigframes_vendored.pandas.core.computation.scope import DEFAULT_GLOBALS +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + +REDUCTIONS = ("sum", "prod", "min", "max") + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) + +MATHOPS = _unary_math_ops + _binary_math_ops + + +LOCAL_TAG = "__pd_eval_local_" + + +class Term: + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + # error: Argument 2 for "super" not an instance of argument 1 + supr_new = super(Term, klass).__new__ # type: ignore[misc] + return supr_new(klass) + + is_local: bool + + def __init__(self, name, env, side=None, encoding=None) -> None: + # name is a str for Term, but may be something else for subclasses + self._name = name + self.env = env + self.side = side + tname = str(name) + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self) -> str: + return self.name.replace(LOCAL_TAG, "") + + def __repr__(self) -> str: + return pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs) -> Term: + return self + + def _resolve_name(self): + local_name = str(self.local_name) + is_local = self.is_local + if local_name in self.env.scope and isinstance( + self.env.scope[local_name], type + ): + is_local = False + + res = self.env.resolve(local_name, is_local=is_local) + self.update(res) + + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2, are not supported with eval" + ) + return res + + def update(self, value) -> None: + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, str): + self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def is_scalar(self) -> bool: + return 
is_scalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + + @property + def is_datetime(self) -> bool: + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value) -> None: + self._value = new_value + + @property + def name(self): + return self._name + + @property + def ndim(self) -> int: + return self._value.ndim + + +class Constant(Term): + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + def __repr__(self) -> str: + # in python 2 str() of float + # can truncate shorter than repr() + return repr(self.name) + + +_bool_op_map = {"not": "~", "and": "&", "or": "|"} + + +class Op: + """ + Hold an operator of arbitrary arity. + """ + + op: str + + def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None: + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = encoding + + def __iter__(self) -> Iterator: + return iter(self.operands) + + def __repr__(self) -> str: + """ + Print a generic n-ary operator and its operands using infix notation. + """ + # recurse over the operands + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): + return np.bool_ + return result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self) -> bool: + types = self.operand_types + obj_dtype_set = frozenset([np.dtype("object")]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def is_scalar(self) -> bool: + return all(operand.is_scalar for operand in self.operands) + + @property + def is_datetime(self) -> bool: + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """ + Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """ + Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. 
+ """ + try: + return ~x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_funcs = ( + operator.gt, + operator.lt, + operator.ge, + operator.le, + operator.eq, + operator.ne, + _in, + _not_in, +) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) + +BOOL_OPS_SYMS = ("&", "|", "and", "or") +_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) + +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_funcs = ( + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.pow, + operator.floordiv, + operator.mod, +) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) + +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") +_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) +_special_case_arith_ops_dict = dict( + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) +) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: + """ + Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + acceptable_dtypes : list of acceptable numpy.dtype + Will not cast if term's dtype in this list. + dtype : str or numpy.dtype + The dtype to cast to. + """ + dt = np.dtype(dtype) + for term in terms: + if term.type in acceptable_dtypes: + continue + + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj) -> bool: + return isinstance(obj, Term) + + +class BinOp(Op): + """ + Hold a binary operator and its operands. + + Parameters + ---------- + op : str + lhs : Term or Op + rhs : Term or Op + """ + + def __init__(self, op: str, lhs, rhs) -> None: + super().__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError as err: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError( + f"Invalid binary operator {repr(op)}, valid operators are {keys}" + ) from err + + def __call__(self, env): + """ + Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): + """ + Evaluate a binary operation *before* being passed to the engine. 
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == "python": + res = self(env) + else: + # recurse over the left/right nodes + + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + from pandas.core.computation.eval import eval + + res = eval(self, local_dict=env, engine=engine, parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self) -> None: + """ + Convert datetimes to a comparable value in an expression. + """ + + def stringify(value): + encoder: Callable + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) + if ( + (lhs.is_scalar or rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) + ) + ) + ): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype) -> bool: + return issubclass(np.dtype(dtype).type, np.number) + + +class Div(BinOp): + """ + Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + """ + + def __init__(self, lhs, rhs) -> None: + super().__init__("/", lhs, rhs) + + if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + raise TypeError( + f"unsupported operand type(s) for {self.op}: " + f"'{lhs.return_type}' and '{rhs.return_type}'" + ) + + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float64] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) + + +UNARY_OPS_SYMS = ("+", "-", "~", "not") +_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) + + +class UnaryOp(Op): + """ + Hold a unary operator and its operands. + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. 
+ """ + + def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: + super().__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError as err: + raise ValueError( + f"Invalid unary operator {repr(op)}, " + f"valid operators are {UNARY_OPS_SYMS}" + ) from err + + def __call__(self, env) -> MathCall: + operand = self.operand(env) + # error: Cannot call function of unknown type + return self.func(operand) # type: ignore[operator] + + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") + + @property + def return_type(self) -> np.dtype: + operand = self.operand + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") + + +class MathCall(Op): + def __init__(self, func, args) -> None: + super().__init__(func.name, args) + self.func = func + + def __call__(self, env): + # error: "Op" not callable + operands = [op(env) for op in self.operands] # type: ignore[operator] + return self.func.func(*operands) + + def __repr__(self) -> str: + operands = map(str, self.operands) + return pprint_thing(f"{self.op}({','.join(operands)})") + + +class FuncNode: + def __init__(self, name: str) -> None: + if name not in MATHOPS: + raise ValueError(f'"{name}" is not a supported function') + self.name = name + self.func = getattr(np, name) + + def __call__(self, *args): + return MathCall(self, args) diff --git a/third_party/bigframes_vendored/pandas/core/computation/parsing.py b/third_party/bigframes_vendored/pandas/core/computation/parsing.py new file mode 100644 index 0000000000..e54f459735 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/parsing.py @@ -0,0 +1,196 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/parsing.py +""" +:func:`~pandas.eval` source string parsing functions +""" +from __future__ import annotations + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Hashable, Iterator + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + than terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # token.tok_name contains a readable description of the replacement string. 
+ special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items()) + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + "°": "_DEGREESIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join([special_characters_replacements.get(char, char) for char in name]) + name = f"BACKTICK_QUOTED_STRING_{name}" + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: Hashable) -> Hashable: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed a Python code + inside a backtick quoted string and than being cleaned + (removed of any special characters). + + Parameters + ---------- + name : hashable + Name to be cleaned. + + Returns + ------- + name : hashable + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not caught and propagates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). 
+ """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). + """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception as err: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err + else: + yield toknum, tokval diff --git a/third_party/bigframes_vendored/pandas/core/computation/scope.py b/third_party/bigframes_vendored/pandas/core/computation/scope.py new file mode 100644 index 0000000000..bfd7eb1d12 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/scope.py @@ -0,0 +1,355 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/scope.py +""" +Module for scope operations +""" +from __future__ import annotations + +from collections import ChainMap +import datetime +import inspect +from io import StringIO +import itertools +import pprint +import struct +import sys +from typing import TypeVar + +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.errors import UndefinedVariableError + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes +class DeepChainMap(ChainMap[_KT, _VT]): + """ + Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. + """ + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None +) -> Scope: + """Ensure that we are grabbing the correct scope.""" + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + +def _replacer(x) -> str: + """ + Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. 
+ """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj) -> str: + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack("@P", id(obj)) + return "".join([_replacer(x) for x in packed]) + + +DEFAULT_GLOBALS = { + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, +} + + +def _get_pretty_string(obj) -> str: + """ + Return a prettier version of obj. + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. + + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + + __slots__ = ["level", "scope", "target", "resolvers", "temps"] + level: int + scope: DeepChainMap + resolvers: DeepChainMap + temps: dict + + def __init__( + self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None + ) -> None: + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self._update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + scope_global = self.scope.new_child( + (global_dict if global_dict is not None else frame.f_globals).copy() + ) + self.scope = DeepChainMap(scope_global) + if not isinstance(local_dict, Scope): + scope_local = self.scope.new_child( + (local_dict if local_dict is not None else frame.f_locals).copy() + ) + self.scope = DeepChainMap(scope_local) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __repr__(self) -> str: + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + + @property + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. + + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key: str, is_local: bool): + """ + Resolve a variable name in a possibly local context. 
+ + Parameters + ---------- + key : str + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError as err: + raise UndefinedVariableError(key, is_local) from err + + def swapkey(self, old_key: str, new_key: str, new_value=None) -> None: + """ + Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes: list[str]) -> None: + """ + Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, f"f_{scope}") + self.scope = DeepChainMap(self.scope.new_child(d)) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def _update(self, level: int) -> None: + """ + Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=["locals"]) + finally: + del stack[:], stack + + def add_tmp(self, value) -> str: + """ + Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + str + The name of the temporary variable created. + """ + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self) -> int: + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self) -> DeepChainMap: + """ + Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. 
+ """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/third_party/bigframes_vendored/pandas/core/config_init.py b/third_party/bigframes_vendored/pandas/core/config_init.py index 33c6b3e093..a3178e2761 100644 --- a/third_party/bigframes_vendored/pandas/core/config_init.py +++ b/third_party/bigframes_vendored/pandas/core/config_init.py @@ -32,9 +32,21 @@ User can execute the job by calling .to_pandas() >>> # df.to_pandas() -Reset option +Reset repr_mode option >>> bpd.options.display.repr_mode = "head" +Can also set the progress_bar option to see the progress bar in terminal, + >>> bpd.options.display.progress_bar = "terminal" + +notebook, + >>> bpd.options.display.progress_bar = "notebook" + +or just remove it. + >>> bpd.options.display.progress_bar = None + +Setting to default value "auto" will detect and show progress bar automatically. + >>> bpd.options.display.progress_bar = "auto" + Attributes: max_columns (int, default 20): If `max_columns` is exceeded, switch to truncate view. @@ -47,18 +59,18 @@ repr_mode (Literal[`head`, `deferred`]): `head`: Execute, download, and display results (limited to head) from - dataframe and series objects during repr. + Dataframe and Series objects during repr. `deferred`: - Prevent executions from repr statements in dataframe and series objects. - Instead estimated bytes processed will be shown. Dataframe and Series + Prevent executions from repr statements in DataFrame and Series objects. + Instead, estimated bytes processed will be shown. DataFrame and Series objects can still be computed with methods that explicitly execute and download results. max_info_columns (int): max_info_columns is used in DataFrame.info method to decide if - per column information will be printed. + information in each column will be printed. max_info_rows (int or None): df.info() will usually show null-counts for each column. - For large frames this can be quite slow. max_info_rows and max_info_cols + For large frames, this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than specified. memory_usage (bool): diff --git a/third_party/bigframes_vendored/pandas/core/dtypes/inference.py b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py new file mode 100644 index 0000000000..fcbb4c242f --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py @@ -0,0 +1,31 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/dtypes/inference.py +""" basic inference routines """ + +from __future__ import annotations + +from collections import abc + + +def iterable_not_string(obj) -> bool: + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. 
+ + Examples + -------- + >>> iterable_not_string([1, 2, 3]) + True + >>> iterable_not_string("foo") + False + >>> iterable_not_string(1) + False + """ + return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d70d3827e7..e5aa47ad3e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2804,6 +2804,57 @@ def combine_first(self, other) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def explode( + self, column: Union[str, Sequence[str]], *, ignore_index: Optional[bool] = False + ) -> DataFrame: + """ + Transform each element of an array to a row, replicating index values. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [[0, 1, 2], [], [], [3, 4]], + ... 'B': 1, + ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) + >>> df.explode('A') + A B C + 0 0 1 ['a' 'b' 'c'] + 0 1 1 ['a' 'b' 'c'] + 0 2 1 ['a' 'b' 'c'] + 1 1 [] + 2 1 [] + 3 3 1 ['d' 'e'] + 3 4 1 ['d' 'e'] + + [7 rows x 3 columns] + >>> df.explode(list('AC')) + A B C + 0 0 1 a + 0 1 1 b + 0 2 1 c + 1 1 + 2 1 + 3 3 1 d + 3 4 1 e + + [7 rows x 3 columns] + + Args: + column (str, Sequence[str]): + Column(s) to explode. For multiple columns, specify a non-empty list + with each element be str or tuple, and all specified columns their + list-like data on same row of the frame must have matching length. + ignore_index (bool, default False): + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns: + bigframes.series.DataFrame: Exploded lists to rows of the subset columns; + index will be duplicated for these rows. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, method, min_periods, numeric_only) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -2865,6 +2916,7 @@ def cov(self, *, numeric_only) -> DataFrame: Returns: DataFrame: The covariance matrix of the series of the DataFrame. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None @@ -4415,10 +4467,10 @@ def cumprod(self) -> DataFrame: [3 rows x 2 columns] >>> df.cumprod() - A B - 0 3 1 - 1 3 2 - 2 6 6 + A B + 0 3.0 1.0 + 1 3.0 2.0 + 2 6.0 6.0 [3 rows x 2 columns] @@ -4796,7 +4848,7 @@ def index(self): MultiIndex([( 'Alice', 'Seattle'), ( 'Bob', 'New York'), ('Aritra', 'Kona')], - name='Name') + names=['Name', 'Location']) >>> df1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) @@ -4931,6 +4983,158 @@ def value_counts( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def eval(self, expr: str) -> DataFrame: + """ + Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. This allows + `eval` to run arbitrary code, which can make you vulnerable to code + injection if you pass user input to this function. 
+ + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: Int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + [5 rows x 3 columns] + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + + [5 rows x 4 columns] + + + Args: + expr (str): + The expression string to evaluate. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def query(self, expr: str) -> DataFrame | None: + """ + Query the columns of a DataFrame with a boolean expression. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + + [5 rows x 3 columns] + >>> df.query('A > B') + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + + Args: + expr (str): + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def interpolate(self, method: str = "linear"): """ Fill NaN values using an interpolation method. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 8730cf0007..e1cc8c5a53 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -78,7 +78,7 @@ def median( Include only float, int, boolean columns. exact (bool, default False): Calculate the exact median instead of an approximation. Note: - ``exact=True`` not yet supported. + ``exact=True`` is not supported. Returns: pandas.Series or pandas.DataFrame: Median of groups. @@ -178,7 +178,7 @@ def sum( Include only float, int, boolean columns. 
        min_count (int, default 0):
            The required number of valid values to perform the operation. If fewer
-            than ``min_count`` non-NA values are present the result will be NA.
+            than ``min_count`` non-NA values are present, the result will be NA.
 
     Returns:
         Series or DataFrame: Computed sum of values within each group.
@@ -194,7 +194,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0):
            Include only float, int, boolean columns.
        min_count (int, default 0):
            The required number of valid values to perform the operation. If fewer
-            than ``min_count`` non-NA values are present the result will be NA.
+            than ``min_count`` non-NA values are present, the result will be NA.
 
     Returns:
         Series or DataFrame: Computed prod of values within each group.
@@ -214,7 +214,7 @@ def min(
            Include only float, int, boolean columns.
        min_count (int, default 0):
            The required number of valid values to perform the operation. If fewer
-            than ``min_count`` non-NA values are present the result will be NA.
+            than ``min_count`` non-NA values are present, the result will be NA.
 
     Returns:
         Series or DataFrame: Computed min of values within each group.
@@ -234,7 +234,7 @@ def max(
            Include only float, int, boolean columns.
        min_count (int, default 0):
            The required number of valid values to perform the operation. If fewer
-            than ``min_count`` non-NA values are present the result will be NA.
+            than ``min_count`` non-NA values are present, the result will be NA.
 
     Returns:
         Series or DataFrame: Computed max of values within each group.
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
index 3ad8729271..7f5761e45b 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/base.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -8,6 +8,18 @@ class Index:
     """Immutable sequence used for indexing and alignment.
 
     The basic object storing axis labels for all objects.
+
+    Args:
+        data (pandas.Series | pandas.Index | bigframes.series.Series | bigframes.core.indexes.base.Index):
+            Labels (1-dimensional).
+        dtype:
+            Data type for the output Index. If not specified, this will be
+            inferred from `data`.
+        name:
+            Name to be stored in the index.
+        session (Optional[bigframes.session.Session]):
+            BigQuery DataFrames session where queries are run. If not set,
+            a default session is used.
     """
 
     @property
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 0aebd0660f..785755a562 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -3,7 +3,7 @@
 """
 from __future__ import annotations
 
-from typing import Hashable, IO, Literal, Mapping, Sequence, TYPE_CHECKING
+from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING
 
 from bigframes_vendored.pandas.core.generic import NDFrame
 import numpy as np
@@ -87,7 +87,7 @@ def index(self):
         MultiIndex([( 'Alice', 'Seattle'),
                     (   'Bob', 'New York'),
                     ('Aritra',     'Kona')],
-                   name='Name')
+                   names=['Name', 'Location'])
 
         >>> s1.index.values
         array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')],
               dtype=object)
@@ -751,6 +751,34 @@ def round(self, decimals: int = 0) -> Series:
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
+        """
+        Transform each element of a list-like to a row.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 3], [], [3, 4]]) + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 + 2 3 + 2 4 + dtype: Int64 + + Args: + ignore_index (bool, default False): + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns: + bigframes.series.Series: Exploded lists to rows; index will be duplicated for these rows. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute the correlation with the other Series. Non-number values are ignored in the diff --git a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py index 8a8a562bae..442220f237 100644 --- a/third_party/bigframes_vendored/pandas/core/tools/datetimes.py +++ b/third_party/bigframes_vendored/pandas/core/tools/datetimes.py @@ -48,7 +48,7 @@ def to_datetime( >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True) 0 2021-01-31 14:30:00+00:00 1 2021-02-28 15:45:00+00:00 - Name: 0, dtype: timestamp[us, tz=UTC][pyarrow] + dtype: timestamp[us, tz=UTC][pyarrow] Converting a Series of Strings with Timezone Information: diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index f8da9efdc0..19f56965df 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -257,9 +257,6 @@ def scatter( - A string with the name of the column to be used for marker's size. - A single scalar so all points have the same size. - - A sequence of scalars, which will be used for each point's size - recursively. For instance, when passing [2,14] all points size - will be either 2 or 14, alternatively. c (str, int or array-like, optional): The color of each point. Possible values are: diff --git a/third_party/bigframes_vendored/pandas/util/_exceptions.py b/third_party/bigframes_vendored/pandas/util/_exceptions.py new file mode 100644 index 0000000000..4ca649153a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_exceptions.py @@ -0,0 +1,29 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_exceptions.py +from __future__ import annotations + +import inspect +import os + + +def find_stack_level() -> int: + """ + Find the first place in the stack that is not inside pandas + (tests notwithstanding). 
+ """ + + import pandas as pd + + pkg_dir = os.path.dirname(pd.__file__) + test_dir = os.path.join(pkg_dir, "tests") + + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + n = 0 + while frame: + fname = inspect.getfile(frame) + if fname.startswith(pkg_dir) and not fname.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + return n diff --git a/third_party/bigframes_vendored/pandas/util/_validators.py b/third_party/bigframes_vendored/pandas/util/_validators.py new file mode 100644 index 0000000000..1f36e0d528 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_validators.py @@ -0,0 +1,58 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_validators.py +""" +Module that contains many useful utilities +for validating data or function arguments +""" +from __future__ import annotations + +from typing import TypeVar + +from pandas.core.dtypes.common import is_bool + +BoolishT = TypeVar("BoolishT", bool, int) +BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) + + +def validate_bool_kwarg( + value: BoolishNoneT, + arg_name: str, + none_allowed: bool = True, + int_allowed: bool = False, +) -> BoolishNoneT: + """ + Ensure that argument passed in arg_name can be interpreted as boolean. + + Parameters + ---------- + value : bool + Value to be validated. + arg_name : str + Name of the argument. To be reflected in the error message. + none_allowed : bool, default True + Whether to consider None to be a valid boolean. + int_allowed : bool, default False + Whether to consider integer value to be a valid boolean. + + Returns + ------- + value + The same value as input. + + Raises + ------ + ValueError + If the value is not a valid boolean. + """ + good_value = is_bool(value) + if none_allowed: + good_value = good_value or (value is None) + + if int_allowed: + good_value = good_value or isinstance(value, int) + + if not good_value: + raise ValueError( + f'For argument "{arg_name}" expected type bool, received ' + f"type {type(value).__name__}." + ) + return value # pyright: ignore[reportGeneralTypeIssues] diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index b08eb10492..4b0bd42706 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -19,9 +19,9 @@ class ColumnTransformer(_BaseComposition): """Applies transformers to columns of BigQuery DataFrames. This estimator allows different columns or column subsets of the input - to be transformed separately and the features generated by each transformer + to be transformed separately, and the features generated by each transformer will be concatenated to form a single feature space. - This is useful for heterogeneous or columnar data, to combine several + This is useful for heterogeneous or columnar data to combine several feature extraction mechanisms or transformations into a single transformer. 
Args: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index dcce75d1d9..f126e0439d 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -22,8 +22,8 @@ class PCA(BaseEstimator, metaclass=ABCMeta): Args: n_components (int, float or None, default None): - Number of components to keep. - If n_components is not set all components are kept. n_components = min(n_samples, n_features). + Number of components to keep. If n_components is not set all + components are kept, n_components = min(n_samples, n_features). If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. svd_solver ("full", "randomized" or "auto", default "auto"): The solver to use to calculate the principal components. Details: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-pca#pca_solver. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 88ff32ea06..494c730a6d 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -38,7 +38,7 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Dict isn't - supported now. + supported. l1_reg (float or None, default None): The amount of L1 regularization applied. Default to None. Can't be set in "normal_equation" mode. If unset, value 0 is used. l2_reg (float, default 0.0): diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index ac919edbe3..dee8b350c0 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -27,6 +27,29 @@ def auc(x, y) -> float: way to summarize a precision-recall curve, see :func:`average_precision_score`. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> x = bpd.DataFrame([1, 1, 2, 2]) + >>> y = bpd.DataFrame([2, 3, 4, 5]) + >>> auc = bigframes.ml.metrics.auc(x, y) + >>> auc + 3.5 + + The input can be Series: + + >>> df = bpd.DataFrame( + ... {"x": [1, 1, 2, 2], + ... "y": [2, 3, 4, 5],} + ... ) + >>> auc = bigframes.ml.metrics.auc(df["x"], df["y"]) + >>> auc + 3.5 + + Args: x (Series or DataFrame of shape (n_samples,)): X coordinates. These must be either monotonic increasing or monotonic @@ -44,6 +67,28 @@ def roc_auc_score(y_true, y_score) -> float: """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ from prediction scores. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([0, 0, 1, 1, 0, 1, 0, 1, 1, 1]) + >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]) + >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(y_true, y_score) + >>> roc_auc_score + 0.625 + + The input can be Series: + + >>> df = bpd.DataFrame( + ... {"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], + ... 
"y_score": [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45],} + ... ) + >>> roc_auc_score = bigframes.ml.metrics.roc_auc_score(df["y_true"], df["y_score"]) + >>> roc_auc_score + 0.625 + Args: y_true (Series or DataFrame of shape (n_samples,)): True labels or binary label indicators. The binary and multiclass cases @@ -72,6 +117,39 @@ def roc_curve( ): """Compute Receiver operating characteristic (ROC). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([1, 1, 2, 2]) + >>> y_score = bpd.DataFrame([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = bigframes.ml.metrics.roc_curve(y_true, y_score, drop_intermediate=False) + >>> fpr + 0 0.0 + 1 0.0 + 2 0.0 + 3 0.0 + 4 0.0 + Name: fpr, dtype: Float64 + + >>> tpr + 0 0.0 + 1 0.333333 + 2 0.5 + 3 0.833333 + 4 1.0 + Name: tpr, dtype: Float64 + + >>> thresholds + 0 inf + 1 0.8 + 2 0.4 + 3 0.35 + 4 0.1 + Name: thresholds, dtype: Float64 + Args: y_true: Series or DataFrame of shape (n_samples,) True binary labels. If labels are not either {-1, 1} or {0, 1}, then diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index be531a9b1c..c3e579bd29 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -64,3 +64,30 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: float: The :math:`R^2` score. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def mean_squared_error(y_true, y_pred) -> float: + """Mean squared error regression loss. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.ml.metrics + >>> bpd.options.display.progress_bar = None + + >>> y_true = bpd.DataFrame([3, -0.5, 2, 7]) + >>> y_pred = bpd.DataFrame([2.5, 0.0, 2, 8]) + >>> mse = bigframes.ml.metrics.mean_squared_error(y_true, y_pred) + >>> mse + 0.375 + + Args: + y_true (Series or DataFrame of shape (n_samples,)): + Ground truth (correct) target values. + y_pred (Series or DataFrame of shape (n_samples,)): + Estimated target values. + + Returns: + float: Mean squared error. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 5fcc481573..98b9d0371f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -18,7 +18,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): strategy ({'uniform', 'quantile'}, default='quantile'): Strategy used to define the widths of the bins. 'uniform': All bins in each feature have identical widths. 'quantile': All bins in each - feature have the same number of points. Only `uniform` is supported now. + feature have the same number of points. Only `uniform` is supported. """ def fit(self, X, y=None): diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index 250e34dc2c..424b17a371 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -1,4 +1,4 @@ -"""Scikit-Learn Wrapper interface for XGBoost.""" +"""scikit-learn Wrapper interface for XGBoost.""" from typing import Any