diff --git a/notebooks/CodeFill.ipynb b/notebooks/CodeFill.ipynb index 4761044..b26ce26 100644 --- a/notebooks/CodeFill.ipynb +++ b/notebooks/CodeFill.ipynb @@ -1,1388 +1,1391 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "CodeFill.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Install the correct dependencies for HuggingFace transformers and TensorFlow" + ], + "metadata": { + "id": "xIT_uHUdThub" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "CodeFill.ipynb", - "provenance": [], - "collapsed_sections": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" + "id": "f3KWTckbTUs2", + "outputId": "2012bedd-8fb1-4909-d4bc-1f0ce0ad416e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: tensorflow 2.8.0\n", + "Uninstalling tensorflow-2.8.0:\n", + " Successfully uninstalled tensorflow-2.8.0\n", + "Collecting git+https://github.com/huggingface/transformers\n", + " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nl4u3bjj\n", + " Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-nl4u3bjj\n", + " Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n", + " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n", + " Preparing wheel metadata ... \u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2.23.0)\n", + "Collecting huggingface-hub<1.0,>=0.1.0\n", + " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n", + "\u001B[K |████████████████████████████████| 77 kB 4.2 MB/s \n", + "\u001B[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2019.12.20)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.11.3)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (21.3)\n", + "Collecting pyyaml>=5.1\n", + " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", + "\u001B[K |████████████████████████████████| 596 kB 16.8 MB/s \n", + "\u001B[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", + " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", + "\u001B[K |████████████████████████████████| 6.6 MB 42.7 MB/s \n", + "\u001B[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (3.6.0)\n", + "Collecting sacremoses\n", + " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n", + "\u001B[K |████████████████████████████████| 895 kB 44.5 MB/s \n", + "\u001B[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.64.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from 
transformers==4.19.0.dev0) (1.21.6)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.19.0.dev0) (4.2.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.19.0.dev0) (3.0.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.19.0.dev0) (3.8.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (3.0.4)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.1.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.15.0)\n", + "Building wheels for collected packages: transformers\n", + " Building wheel for transformers (PEP 517) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for transformers: filename=transformers-4.19.0.dev0-py3-none-any.whl size=4040857 sha256=84073fb7ad0bd8a06a08636e44e15aadf97eb4dd26c2d2a0e3dfc90232f8158b\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-ii8u1on2/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522\n", + "Successfully built transformers\n", + "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", + " Attempting uninstall: pyyaml\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + "Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.19.0.dev0\n", + "tokenizers 0.12.1\n", + "transformers 4.19.0.dev0\n", + "Collecting nlp==0.2.0\n", + " Downloading nlp-0.2.0-py3-none-any.whl (857 kB)\n", + "\u001B[K |████████████████████████████████| 857 kB 8.9 MB/s \n", + "\u001B[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (3.6.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (2.23.0)\n", + "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (6.0.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (1.21.6)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (4.64.0)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (0.3.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2021.10.8)\n", + "Requirement already satisfied: 
chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2.10)\n", + "Installing collected packages: nlp\n", + "Successfully installed nlp-0.2.0\n", + "Collecting datasets\n", + " Downloading datasets-2.1.0-py3-none-any.whl (325 kB)\n", + "\u001B[K |████████████████████████████████| 325 kB 8.6 MB/s \n", + "\u001B[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", + "Collecting fsspec[http]>=2021.05.0\n", + " Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)\n", + "\u001B[K |████████████████████████████████| 136 kB 60.7 MB/s \n", + "\u001B[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.11.3)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.5.1)\n", + "Collecting xxhash\n", + " Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", + "\u001B[K |████████████████████████████████| 212 kB 49.9 MB/s \n", + "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.0)\n", + "Collecting aiohttp\n", + " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", + "\u001B[K |████████████████████████████████| 1.1 MB 48.8 MB/s \n", + "\u001B[?25hRequirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", + "Collecting responses<0.19\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in 
/usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", + " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", + "\u001B[K |████████████████████████████████| 127 kB 59.6 MB/s \n", + "\u001B[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.12)\n", + "Collecting asynctest==0.13.0\n", + " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", + "Collecting multidict<7.0,>=4.5\n", + " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", + "\u001B[K |████████████████████████████████| 94 kB 3.7 MB/s \n", + "\u001B[?25hCollecting yarl<2.0,>=1.0\n", + " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", + "\u001B[K |████████████████████████████████| 271 kB 57.7 MB/s \n", + "\u001B[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", + "Collecting frozenlist>=1.1.1\n", + " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "\u001B[K |████████████████████████████████| 144 kB 62.5 MB/s \n", + "\u001B[?25hCollecting aiosignal>=1.1.2\n", + " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", + "Installing collected packages: multidict, frozenlist, yarl, urllib3, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, responses, datasets\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.24.3\n", + " Uninstalling urllib3-1.24.3:\n", + " Successfully uninstalled urllib3-1.24.3\n", + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", + "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001B[0m\n", + "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.1.0 frozenlist-1.3.0 fsspec-2022.3.0 multidict-6.0.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2\n", + "Collecting git+https://github.com/huggingface/nlp\n", + " Cloning https://github.com/huggingface/nlp to /tmp/pip-req-build-zp5gwnk_\n", + " Running command git clone -q https://github.com/huggingface/nlp /tmp/pip-req-build-zp5gwnk_\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.21.6)\n", + "Requirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (6.0.1)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.3.4)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.3.5)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2.23.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.64.0)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.0.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.70.12.2)\n", + "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2022.3.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.8.1)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.5.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (21.3)\n", + "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.18.0)\n", + "Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.11.3)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (3.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (4.2.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (6.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==2.1.1.dev0) (3.0.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2021.10.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
/usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (1.25.11)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (3.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (4.0.2)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (2.0.12)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.3.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.2.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (21.4.0)\n", + "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (0.13.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.7.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (6.0.2)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata->datasets==2.1.1.dev0) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==2.1.1.dev0) (1.15.0)\n", + "Building wheels for collected packages: datasets\n", + " Building wheel for datasets (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + " Created wheel for datasets: filename=datasets-2.1.1.dev0-py3-none-any.whl size=327456 sha256=d7cadb2d89edc35e658adda0904ba4c5a7a20d39e04a3060fbdc3888fc78f67f\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-yre84zb1/wheels/b7/b2/b6/a0b4e0d11cb66d705e54f7bb72fdbe910b5e9f198ada8b4347\n", + "Successfully built datasets\n", + "Installing collected packages: datasets\n", + " Attempting uninstall: datasets\n", + " Found existing installation: datasets 2.1.0\n", + " Uninstalling datasets-2.1.0:\n", + " Successfully uninstalled datasets-2.1.0\n", + "Successfully installed datasets-2.1.1.dev0\n" + ] } + ], + "source": [ + "# We won't need TensorFlow here\n", + "!pip uninstall -y tensorflow\n", + "# Install `transformers` from master\n", + "!pip install git+https://github.com/huggingface/transformers\n", + "!pip list | grep -E 'transformers|tokenizers'\n", + "!pip install nlp==0.2.0\n", + "!pip install datasets\n", + "!pip install git+https://github.com/huggingface/nlp\n", + "\n", + "# transformers version at notebook update --- 2.11.0\n", + "# tokenizers version at notebook update --- 0.8.0rc1" + ] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "Install the correct dependencies on HuggingFace transformer and ternsorflow" - ], - "metadata": { - "id": "xIT_uHUdThub" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f3KWTckbTUs2", - "outputId": "2012bedd-8fb1-4909-d4bc-1f0ce0ad416e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Found existing installation: tensorflow 2.8.0\n", - "Uninstalling tensorflow-2.8.0:\n", - " Successfully uninstalled tensorflow-2.8.0\n", - "Collecting git+https://github.com/huggingface/transformers\n", - " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nl4u3bjj\n", - " Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-nl4u3bjj\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2.23.0)\n", - "Collecting huggingface-hub<1.0,>=0.1.0\n", - " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n", - "\u001b[K |████████████████████████████████| 77 kB 4.2 MB/s \n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2019.12.20)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.11.3)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (21.3)\n", - "Collecting pyyaml>=5.1\n", - " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[K |████████████████████████████████| 596 kB 16.8 MB/s \n", - "\u001b[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", - " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", - "\u001b[K |████████████████████████████████| 6.6 MB 42.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (3.6.0)\n", - "Collecting sacremoses\n", - " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n", - "\u001b[K |████████████████████████████████| 895 kB 44.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.64.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (1.21.6)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.19.0.dev0) (4.2.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.19.0.dev0) (3.0.8)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.19.0.dev0) (3.8.0)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (3.0.4)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.1.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (7.1.2)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.15.0)\n", - "Building wheels for collected packages: transformers\n", - " Building wheel for transformers (PEP 517) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for transformers: filename=transformers-4.19.0.dev0-py3-none-any.whl size=4040857 sha256=84073fb7ad0bd8a06a08636e44e15aadf97eb4dd26c2d2a0e3dfc90232f8158b\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-ii8u1on2/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522\n", - "Successfully built transformers\n", - "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", - " Attempting uninstall: pyyaml\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - "Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.19.0.dev0\n", - "tokenizers 0.12.1\n", - "transformers 4.19.0.dev0\n", - "Collecting nlp==0.2.0\n", - " Downloading nlp-0.2.0-py3-none-any.whl (857 kB)\n", - "\u001b[K |████████████████████████████████| 857 kB 8.9 MB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (3.6.0)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (2.23.0)\n", - "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (6.0.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (1.21.6)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (4.64.0)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (0.3.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2.10)\n", - "Installing collected packages: nlp\n", - "Successfully installed nlp-0.2.0\n", - "Collecting datasets\n", - " Downloading datasets-2.1.0-py3-none-any.whl (325 kB)\n", - "\u001b[K |████████████████████████████████| 325 kB 8.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", - "Collecting fsspec[http]>=2021.05.0\n", - " Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)\n", - "\u001b[K |████████████████████████████████| 136 kB 60.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.11.3)\n", - "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.5.1)\n", 
- "Collecting xxhash\n", - " Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", - "\u001b[K |████████████████████████████████| 212 kB 49.9 MB/s \n", - "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.0)\n", - "Collecting aiohttp\n", - " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", - "\u001b[K |████████████████████████████████| 1.1 MB 48.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", - "Collecting responses<0.19\n", - " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", - "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", - " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", - "\u001b[K |████████████████████████████████| 127 kB 59.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.12)\n", - "Collecting asynctest==0.13.0\n", - " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", - "Collecting multidict<7.0,>=4.5\n", - " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", - "\u001b[K |████████████████████████████████| 94 kB 3.7 MB/s \n", - "\u001b[?25hCollecting yarl<2.0,>=1.0\n", - " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", - "\u001b[K |████████████████████████████████| 271 kB 57.7 MB/s \n", - "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", - " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", - "Collecting frozenlist>=1.1.1\n", - " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", - "\u001b[K |████████████████████████████████| 144 kB 62.5 MB/s \n", - "\u001b[?25hCollecting aiosignal>=1.1.2\n", - " Downloading 
aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.0)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", - "Installing collected packages: multidict, frozenlist, yarl, urllib3, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, responses, datasets\n", - " Attempting uninstall: urllib3\n", - " Found existing installation: urllib3 1.24.3\n", - " Uninstalling urllib3-1.24.3:\n", - " Successfully uninstalled urllib3-1.24.3\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", - "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001b[0m\n", - "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.1.0 frozenlist-1.3.0 fsspec-2022.3.0 multidict-6.0.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2\n", - "Collecting git+https://github.com/huggingface/nlp\n", - " Cloning https://github.com/huggingface/nlp to /tmp/pip-req-build-zp5gwnk_\n", - " Running command git clone -q https://github.com/huggingface/nlp /tmp/pip-req-build-zp5gwnk_\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.21.6)\n", - "Requirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (6.0.1)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.3.4)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.3.5)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2.23.0)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.64.0)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.0.0)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.70.12.2)\n", - "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2022.3.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.8.1)\n", - "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.5.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (21.3)\n", - "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.18.0)\n", - "Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.7/dist-packages (from 
datasets==2.1.1.dev0) (4.11.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (3.6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (4.2.0)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (6.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==2.1.1.dev0) (3.0.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2021.10.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (1.25.11)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (3.0.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (4.0.2)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (2.0.12)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.3.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (21.4.0)\n", - "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (0.13.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.7.2)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (6.0.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata->datasets==2.1.1.dev0) (3.8.0)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2022.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==2.1.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: datasets\n", - " Building wheel for datasets (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for datasets: filename=datasets-2.1.1.dev0-py3-none-any.whl size=327456 sha256=d7cadb2d89edc35e658adda0904ba4c5a7a20d39e04a3060fbdc3888fc78f67f\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-yre84zb1/wheels/b7/b2/b6/a0b4e0d11cb66d705e54f7bb72fdbe910b5e9f198ada8b4347\n", - "Successfully built datasets\n", - "Installing collected packages: datasets\n", - " Attempting uninstall: datasets\n", - " Found existing installation: datasets 2.1.0\n", - " Uninstalling datasets-2.1.0:\n", - " Successfully uninstalled datasets-2.1.0\n", - "Successfully installed datasets-2.1.1.dev0\n" - ] - } - ], - "source": [ - "# We won't need TensorFlow here\n", - "!pip uninstall -y tensorflow\n", - "# Install `transformers` from master\n", - "!pip install git+https://github.com/huggingface/transformers\n", - "!pip list | grep -E 'transformers|tokenizers'\n", - "!pip install nlp==0.2.0\n", - "!pip install datasets\n", - "!pip install git+https://github.com/huggingface/nlp\n", - "\n", - "# transformers version at notebook update --- 2.11.0\n", - "# tokenizers version at notebook update --- 0.8.0rc1" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Fetch datasets" - ], - "metadata": { - "id": "b4Jowf-vf8Ck" - } - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "import tokenize\n", - "import dis\n", - "import sys\n", - "import re\n", - "import keyword\n", - "import pandas as pd\n", - "import ast\n", - "import torch\n", - "import signal\n", - "from functools import wraps\n", - "\n", - "def multireplace(string, replacements, ignore_case=False):\n", - " \"\"\"\n", - " Given a string and a replacement map, it returns the replaced string.\n", - " :param str string: string to execute replacements on\n", - " :param dict replacements: replacement dictionary {value to find: value to replace}\n", - " :param bool ignore_case: whether the match should be case insensitive\n", - " :rtype: str\n", - " \"\"\"\n", - " # If case insensitive, we need to normalize the old string so that later a replacement\n", - " # can be found. 
For instance with {\"HEY\": \"lol\"} we should match and find a replacement for \"hey\",\n", - " # \"HEY\", \"hEy\", etc.\n", - " if ignore_case:\n", - " def normalize_old(s):\n", - " return s.lower()\n", - " re_mode = re.IGNORECASE\n", - " else:\n", - " def normalize_old(s):\n", - " return s\n", - " re_mode = 0\n", - "\n", - " replacements = {normalize_old(key): val for key, val in replacements.items()}\n", - " \n", - " # Place longer ones first to keep shorter substrings from matching where the longer ones should take place\n", - " # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce\n", - " # 'hey ABC' and not 'hey ABc'\n", - " rep_sorted = sorted(replacements, key=len, reverse=True)\n", - " rep_escaped = map(re.escape, rep_sorted)\n", - " \n", - " # Create a big OR regex that matches any of the substrings to replace\n", - " pattern = re.compile(\"|\".join(rep_escaped), re_mode)\n", - " \n", - " # For each match, look up the new string in the replacements, being the key the normalized old string\n", - " return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)\n", - "\n", - "\n", - "def convert(file, output_file):\n", - " with open (file, \"r\") as f:\n", - " text = f.read() \n", - "\n", - " replacements = {}\n", - " for node in ast.iter_child_nodes(ast.parse(text)):\n", - " if isinstance(node, ast.ImportFrom):\n", - " replacements.update({node.module: 'MODULE'})\n", - " if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):\n", - " for i, v in enumerate(node.names):\n", - " if(node.names[i].asname):\n", - " replacements.update({node.names[i].name: 'LIB'}) \n", - " replacements.update({node.names[i].asname: 'ALIAS'})\n", - " else:\n", - " replacements.update({node.names[i].name: 'LIBRARY'})\n", - "\n", - "\n", - " # reomve * from the dictionary (handle from module import * statement)\n", - " replacements.pop('*', None)\n", - " print('List of modules and libraries to replace:\\n', replacements)\n", - "\n", - " with open('med.py','w') as f:\n", - " f.write(multireplace(text, replacements, ignore_case = True))\n", - "\n", - " file = 'med.py'\n", - " with open(file,'rb') as f:\n", - " tokens = list(tokenize.tokenize(f.readline))\n", - " \n", - " ### extract important data from the output of tokenize package\n", - " toks = pd.DataFrame(columns = ['original','type','text', 'line','pos'])\n", - "\n", - " last_line = 0\n", - " last_pos = 0\n", - "\n", - " for token in tokens:\n", - " \n", - " tok_org = token.string\n", - " tok_text = token.string \n", - " tok_type = str(token).split('(')[2].split(')')[0]\n", - "\n", - " # convert keywords to upper\n", - " if keyword.iskeyword(tok_text):\n", - " tok_type = str.upper(tok_text)\n", - " \n", - " #extract operations\n", - " # if tok_type == 'OP':\n", - " # tok_type = tok_text\n", - "\n", - "\n", - " # getting rid of comments and empty lines\n", - " if tok_type in ['NL','NEWLINE','COMMENT']:\n", - " continue\n", - " \n", - " #retrieve the position\n", - " tok_line = token.start[0]\n", - " \n", - " if last_line == tok_line:\n", - " last_pos += 1\n", - " else:\n", - " last_pos = 1\n", - " tok_pos = last_pos\n", - " last_line = tok_line\n", - " \n", - " toks = toks.append({'type':tok_type,\n", - " 'original':tok_org,\n", - " 'text':tok_text,\n", - " 'line':tok_line,\n", - " 'pos':tok_pos},ignore_index=True)\n", - "\n", - "\n", - " # remove encoding lines and end of file\n", - " toks.line = toks.line.astype('int')\n", - " toks.pos = 
toks.pos.astype('int')\n", - " toks = toks.loc[~((toks.type == 'ENCODING') | (toks.type == 'ENDMARKER'))]\n", - " toks['doc'] = (toks.text.str.contains('\"\"\"') | toks.text.str.contains(\"'''\"))\n", - " toks = toks.loc[~(toks.doc)].drop(['doc'],axis=1)\n", - "\n", - " toks.head(20)\n", - "\n", - " indent = 0\n", - " last_line = 0\n", - "\n", - " for index,row in toks.iterrows():\n", - " if row.type == \"INDENT\":\n", - " indent +=1\n", - " continue\n", - " if row.type == \"DEDENT\":\n", - " indent -=1\n", - " continue\n", - " if row.line != last_line:\n", - " last_line = row.line\n", - " toks = toks.append({'type':'\\n'+indent*'\\t',\n", - " 'text':'\\n'+indent*'\\t',\n", - " 'line':row.line,\n", - " 'pos':row.pos-1},ignore_index=True)\n", - "\n", - " toks = toks.loc[~((toks.type=='INDENT') | (toks.type=='DEDENT'))]\n", - " toks = toks.sort_values(['line','pos']).reset_index(drop=True)\n", - "\n", - "\n", - " # drop the first row (empty line)\n", - " toks.drop(toks.index[:1], inplace=True)\n", - "\n", - " toks.head(20)\n", - "\n", - " with open(file,'r') as f:\n", - " src = f.read()\n", - "\n", - " stdout_backup = sys.stdout\n", - " sys.stdout = open('dis.txt','w')\n", - " dis.dis(src)\n", - " sys.stdout = stdout_backup\n", - "\n", - " with open('dis.txt','r') as f:\n", - " lines = f.readlines()\n", - "\n", - " # find global variables\n", - " glbls = [].copy() \n", - " for l in lines:\n", - " clean = l.replace('>>',' ').strip().split()\n", - " if len(clean):\n", - " try:\n", - " int(clean[1])\n", - " line = int(clean[0])\n", - " except:\n", - " clean = [str(line)]+clean\n", - " if 'LOAD_GLOBAL' in clean:\n", - " print('found a global!')\n", - " glbls.append((int(clean[0]),clean[-1].replace('(','').replace(')','')))\n", - "\n", - " for l,n in glbls:\n", - " toks.loc[(toks.line==l) & (toks.text==n),'type'] = 'GLOBAL_VARIABLE'\n", - "\n", - " toks .head(10) \n", - "\n", - " text_imports = ' '.join(list(toks.text)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . ','.').replace(' (','(')\n", - " text_imports = multireplace(text_imports, replacements, ignore_case = True)\n", - "\n", - " with open('normalized_textual_file.py','w') as f:\n", - " f.write(text_imports)\n", - "\n", - " toks.type = toks.apply(lambda x: x['text'] if str(x['text']) in ['LIBRARY','LIB','ALIAS','MODULE'] else x['type'], axis = 1)\n", - " code_converted = ' '.join(list(toks.type)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . 
','.').replace(' (','(')\n", - "\n", - " final_replacements = {'GLOBAL_VARIABLE(':'FUNCTION_CALL(', \n", - " # 'NAME.NAME':'NAME',\n", - " 'NAME(':'FUNCTION_CALL(',\n", - " 'NAME':'LOCAL_VARIABLE'}\n", - "\n", - " code_converted = multireplace(code_converted, final_replacements, ignore_case = False)\n", - "\n", - " with open(output_file,'w') as f:\n", - " f.write(code_converted)\n", - "\n", - "\n", - "WEIGHT_MATRIX = {\n", - " 'NUMBER' : [1.625, 1.25, 1.125],\n", - " 'NAME' : [1.625, 1.125, 1.5],\n", - " 'LOCAL_VARIABLE' : [1.625, 1.125, 1.5],\n", - " 'FUNCTION_NAME' : [1.625, 1.25, 1.5]\n", - " }\n", - "\n", - "input_file = \"/tmp/input_file.txt\"\n", - "output_file = \"/tmp/output_file.txt\"\n", - "def reranking_layer(outputs, context, tokenizer):\n", - "\n", - " with open(input_file, 'w') as f:\n", - " f.write(context);\n", - " \n", - " convert(file_path=input_file, output_file=output_file)\n", - " with open(output_file, 'rb') as context:\n", - " inputs = list(zip(tokenizer(input_file), tokenizer(output_file)))\n", - " for item in inputs:\n", - " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(WEIGHT_MATRIX[item[1]]))\n", - "\n" - ], - "metadata": { - "id": "PGHNJPreFmOw" - }, - "execution_count": 13, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "convert(\"./sample_data/data/peakfinder.py\", \"./converted_train.txt\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cNqN20xrhkGq", - "outputId": "7e041a8c-eb0a-4648-9de0-3c1ffc6440e6" - }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "List of modules and libraries to replace:\n", - " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# pretrain dataset\n", - "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip\n", - "#!unzip 'pretrain_dataset.zip'\n", - "\n", - "# converted dataset\n", - "#! wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/converted_dataset.zip\n", - "#! unzip 'converted_dataset.zip'\n", - "\n", - "# test dataset\n", - "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/finetune_eval_dataset.zip\n", - "#!unzip 'finetune_eval_dataset.zip'" - ], - "metadata": { - "id": "sEmqUukXf__k" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Train a customised python byte-level Byte-pair encoding tokenizer. 
" - ], - "metadata": { - "id": "M0wmpgCxUIF3" - } + { + "cell_type": "markdown", + "source": [ + "Fetch datasets" + ], + "metadata": { + "id": "b4Jowf-vf8Ck" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import tokenize\n", + "import dis\n", + "import sys\n", + "import re\n", + "import keyword\n", + "import pandas as pd\n", + "import ast\n", + "import torch\n", + "import signal\n", + "from functools import wraps\n", + "\n", + "def multireplace(string, replacements, ignore_case=False):\n", + " \"\"\"\n", + " Given a string and a replacement map, it returns the replaced string.\n", + " :param str string: string to execute replacements on\n", + " :param dict replacements: replacement dictionary {value to find: value to replace}\n", + " :param bool ignore_case: whether the match should be case insensitive\n", + " :rtype: str\n", + " \"\"\"\n", + " # If case insensitive, we need to normalize the old string so that later a replacement\n", + " # can be found. For instance with {\"HEY\": \"lol\"} we should match and find a replacement for \"hey\",\n", + " # \"HEY\", \"hEy\", etc.\n", + " if ignore_case:\n", + " def normalize_old(s):\n", + " return s.lower()\n", + " re_mode = re.IGNORECASE\n", + " else:\n", + " def normalize_old(s):\n", + " return s\n", + " re_mode = 0\n", + "\n", + " replacements = {normalize_old(key): val for key, val in replacements.items()}\n", + " \n", + " # Place longer ones first to keep shorter substrings from matching where the longer ones should take place\n", + " # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce\n", + " # 'hey ABC' and not 'hey ABc'\n", + " rep_sorted = sorted(replacements, key=len, reverse=True)\n", + " rep_escaped = map(re.escape, rep_sorted)\n", + " \n", + " # Create a big OR regex that matches any of the substrings to replace\n", + " pattern = re.compile(\"|\".join(rep_escaped), re_mode)\n", + " \n", + " # For each match, look up the new string in the replacements, being the key the normalized old string\n", + " return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)\n", + "\n", + "\n", + "def convert(file, output_file):\n", + " with open (file, \"r\") as f:\n", + " text = f.read() \n", + "\n", + " replacements = {}\n", + " for node in ast.iter_child_nodes(ast.parse(text)):\n", + " if isinstance(node, ast.ImportFrom):\n", + " replacements.update({node.module: 'MODULE'})\n", + " if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):\n", + " for i, v in enumerate(node.names):\n", + " if(node.names[i].asname):\n", + " replacements.update({node.names[i].name: 'LIB'}) \n", + " replacements.update({node.names[i].asname: 'ALIAS'})\n", + " else:\n", + " replacements.update({node.names[i].name: 'LIBRARY'})\n", + "\n", + "\n", + " # reomve * from the dictionary (handle from module import * statement)\n", + " replacements.pop('*', None)\n", + " print('List of modules and libraries to replace:\\n', replacements)\n", + "\n", + " with open('med.py','w') as f:\n", + " f.write(multireplace(text, replacements, ignore_case = True))\n", + "\n", + " file = 'med.py'\n", + " with open(file,'rb') as f:\n", + " tokens = list(tokenize.tokenize(f.readline))\n", + " \n", + " ### extract important data from the output of tokenize package\n", + " toks = pd.DataFrame(columns = ['original','type','text', 'line','pos'])\n", + "\n", + " last_line = 0\n", + " last_pos = 0\n", + "\n", + " for token in tokens:\n", + " \n", + " tok_org = token.string\n", + " 
tok_text = token.string \n", + " tok_type = str(token).split('(')[2].split(')')[0]\n", + "\n", + " # convert keywords to upper\n", + " if keyword.iskeyword(tok_text):\n", + " tok_type = str.upper(tok_text)\n", + " \n", + " #extract operations\n", + " # if tok_type == 'OP':\n", + " # tok_type = tok_text\n", + "\n", + "\n", + " # getting rid of comments and empty lines\n", + " if tok_type in ['NL','NEWLINE','COMMENT']:\n", + " continue\n", + " \n", + " #retrieve the position\n", + " tok_line = token.start[0]\n", + " \n", + " if last_line == tok_line:\n", + " last_pos += 1\n", + " else:\n", + " last_pos = 1\n", + " tok_pos = last_pos\n", + " last_line = tok_line\n", + " \n", + " toks = toks.append({'type':tok_type,\n", + " 'original':tok_org,\n", + " 'text':tok_text,\n", + " 'line':tok_line,\n", + " 'pos':tok_pos},ignore_index=True)\n", + "\n", + "\n", + " # remove encoding lines and end of file\n", + " toks.line = toks.line.astype('int')\n", + " toks.pos = toks.pos.astype('int')\n", + " toks = toks.loc[~((toks.type == 'ENCODING') | (toks.type == 'ENDMARKER'))]\n", + " toks['doc'] = (toks.text.str.contains('\"\"\"') | toks.text.str.contains(\"'''\"))\n", + " toks = toks.loc[~(toks.doc)].drop(['doc'],axis=1)\n", + "\n", + " toks.head(20)\n", + "\n", + " indent = 0\n", + " last_line = 0\n", + "\n", + " for index,row in toks.iterrows():\n", + " if row.type == \"INDENT\":\n", + " indent +=1\n", + " continue\n", + " if row.type == \"DEDENT\":\n", + " indent -=1\n", + " continue\n", + " if row.line != last_line:\n", + " last_line = row.line\n", + " toks = toks.append({'type':'\\n'+indent*'\\t',\n", + " 'text':'\\n'+indent*'\\t',\n", + " 'line':row.line,\n", + " 'pos':row.pos-1},ignore_index=True)\n", + "\n", + " toks = toks.loc[~((toks.type=='INDENT') | (toks.type=='DEDENT'))]\n", + " toks = toks.sort_values(['line','pos']).reset_index(drop=True)\n", + "\n", + "\n", + " # drop the first row (empty line)\n", + " toks.drop(toks.index[:1], inplace=True)\n", + "\n", + " toks.head(20)\n", + "\n", + " with open(file,'r') as f:\n", + " src = f.read()\n", + "\n", + " stdout_backup = sys.stdout\n", + " sys.stdout = open('dis.txt','w')\n", + " dis.dis(src)\n", + " sys.stdout = stdout_backup\n", + "\n", + " with open('dis.txt','r') as f:\n", + " lines = f.readlines()\n", + "\n", + " # find global variables\n", + " glbls = [].copy() \n", + " for l in lines:\n", + " clean = l.replace('>>',' ').strip().split()\n", + " if len(clean):\n", + " try:\n", + " int(clean[1])\n", + " line = int(clean[0])\n", + " except:\n", + " clean = [str(line)]+clean\n", + " if 'LOAD_GLOBAL' in clean:\n", + " print('found a global!')\n", + " glbls.append((int(clean[0]),clean[-1].replace('(','').replace(')','')))\n", + "\n", + " for l,n in glbls:\n", + " toks.loc[(toks.line==l) & (toks.text==n),'type'] = 'GLOBAL_VARIABLE'\n", + "\n", + " toks .head(10) \n", + "\n", + " text_imports = ' '.join(list(toks.text)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . ','.').replace(' (','(')\n", + " text_imports = multireplace(text_imports, replacements, ignore_case = True)\n", + "\n", + " with open('normalized_textual_file.py','w') as f:\n", + " f.write(text_imports)\n", + "\n", + " toks.type = toks.apply(lambda x: x['text'] if str(x['text']) in ['LIBRARY','LIB','ALIAS','MODULE'] else x['type'], axis = 1)\n", + " code_converted = ' '.join(list(toks.type)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . 
','.').replace(' (','(')\n", + "\n", + " final_replacements = {'GLOBAL_VARIABLE(':'FUNCTION_CALL(', \n", + " # 'NAME.NAME':'NAME',\n", + " 'NAME(':'FUNCTION_CALL(',\n", + " 'NAME':'LOCAL_VARIABLE'}\n", + "\n", + " code_converted = multireplace(code_converted, final_replacements, ignore_case = False)\n", + "\n", + " with open(output_file,'w') as f:\n", + " f.write(code_converted)\n", + "\n", + "\n", + "WEIGHT_MATRIX = {\n", + " 'NUMBER' : [1.625, 1.25, 1.125],\n", + " 'NAME' : [1.625, 1.125, 1.5],\n", + " 'LOCAL_VARIABLE' : [1.625, 1.125, 1.5],\n", + " 'FUNCTION_NAME' : [1.625, 1.25, 1.5]\n", + " }\n", + "\n", + "input_file = \"/tmp/input_file.txt\"\n", + "output_file = \"/tmp/output_file.txt\"\n", + "def reranking_layer(outputs, context, tokenizer):\n", + "\n", + " with open(input_file, 'w') as f:\n", + " f.write(context);\n", + " \n", + " # convert() takes (file, output_file); pass the keyword that matches its signature\n", + " convert(file=input_file, output_file=output_file)\n", + " with open(output_file, 'rb') as context:\n", + " inputs = list(zip(tokenizer(input_file), tokenizer(output_file)))\n", + " for item in inputs:\n", + " loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor(WEIGHT_MATRIX[item[1]]))\n", + "\n" + ], + "metadata": { + "id": "PGHNJPreFmOw" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "convert(\"./sample_data/data/peakfinder.py\", \"./converted_train.txt\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cNqN20xrhkGq", + "outputId": "7e041a8c-eb0a-4648-9de0-3c1ffc6440e6" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "List of modules and libraries to replace:\n", + " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# pretrain dataset\n", + "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip\n", + "#!unzip 'pretrain_dataset.zip'\n", + "\n", + "# converted dataset\n", + "#! wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/converted_dataset.zip\n", + "#! unzip 'converted_dataset.zip'\n", + "\n", + "# test dataset\n", + "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/finetune_eval_dataset.zip\n", + "#!unzip 'finetune_eval_dataset.zip'" + ], + "metadata": { + "id": "sEmqUukXf__k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Train a customised Python byte-level Byte-pair encoding tokenizer. 
" + ], + "metadata": { + "id": "M0wmpgCxUIF3" + } + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from transformers import AutoTokenizer,TextDataset,DataCollatorForLanguageModeling\n", + "import glob\n", + "import random \n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n" + ], + "metadata": { + "id": "oPq1Bau8UbpB" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "paths = [str(x) for x in Path(\".\").glob(\"./sample_data/data/*.py\")]\n", + "converted_paths = []\n", + "for path in paths:\n", + " converted_path = \"./sample_data/converted/\"+ path.split(\"/\").pop().split(\".\")[0] + \".txt\"\n", + " print(converted_path)\n", + " try:\n", + " convert(path, converted_path)\n", + " converted_paths.append(converted_path)\n", + " except:\n", + " pass\n", + "\n", + " \n", + "with open(\"./train.txt\", \"wb\") as train_outfile:\n", + " with open(\"./test.txt\", \"wb\") as test_outfile:\n", + " for f in paths:\n", + " choice = random.random()\n", + " with open(f, \"rb\") as infile:\n", + " if choice > 0.1:\n", + " train_outfile.write(infile.read())\n", + " else:\n", + " test_outfile.write(infile.read())\n", + "\n", + "with open(\"./converted_train.txt\", \"wb\") as train_outfile:\n", + " with open(\"./converted_test.txt\", \"wb\") as test_outfile:\n", + " for f in converted_paths:\n", + " choice = random.random()\n", + " with open(f, \"rb\") as infile:\n", + " if choice > 0.1:\n", + " train_outfile.write(infile.read())\n", + " else:\n", + " test_outfile.write(infile.read())\n" + ], + "metadata": { + "id": "vOXkK5bWYNXz", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "768058fc-877f-4d58-bea9-59933c15e9ad" + }, + "execution_count": 26, + "outputs": [ { - "cell_type": "code", - "source": [ - "paths = [str(x) for x in Path(\".\").glob(\"./sample_data/data/*.py\")]\n", - "converted_paths = []\n", - "for path in paths:\n", - " converted_path = \"./sample_data/converted/\"+ path.split(\"/\").pop().split(\".\")[0] + \".txt\"\n", - " print(converted_path)\n", - " try:\n", - " convert(path, converted_path)\n", - " converted_paths.append(converted_path)\n", - " except:\n", - " pass\n", - "\n", - " \n", - "with open(\"./train.txt\", \"wb\") as train_outfile:\n", - " with open(\"./test.txt\", \"wb\") as test_outfile:\n", - " for f in paths:\n", - " choice = random.random()\n", - " with open(f, \"rb\") as infile:\n", - " if choice > 0.1:\n", - " train_outfile.write(infile.read())\n", - " else:\n", - " test_outfile.write(infile.read())\n", - "\n", - "with open(\"./converted_train.txt\", \"wb\") as train_outfile:\n", - " with open(\"./converted_test.txt\", \"wb\") as test_outfile:\n", - " for f in converted_paths:\n", - " choice = random.random()\n", - " with open(f, \"rb\") as infile:\n", - " if choice > 0.1:\n", - " train_outfile.write(infile.read())\n", - " else:\n", - " test_outfile.write(infile.read())\n" - ], - "metadata": { - "id": "vOXkK5bWYNXz", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "768058fc-877f-4d58-bea9-59933c15e9ad" - }, - "execution_count": 26, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "./sample_data/converted/peakfinder.txt\n", - "List of modules and libraries to replace:\n", - " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - 
"found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "./sample_data/converted/peakfinder.txt\n", + "List of modules and libraries to replace:\n", + " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def load_dataset(train_path,test_path,tokenizer):\n", + " train_dataset = TextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=train_path,\n", + " block_size=128)\n", + " \n", + " test_dataset = TextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=test_path,\n", + " block_size=128) \n", + " \n", + " data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=False,\n", + " )\n", + " return train_dataset,test_dataset,data_collator\n", + "\n", + "train_dataset,test_dataset,data_collator = load_dataset(\"./train.txt\", \"./test.txt\",tokenizer)\n", + "converted_train_dataset, converted_test_dataset, converted_datacollator = load_dataset(\"./converted_train.txt\", \"./converted_test.txt\",tokenizer)\n", + "#pretrain_raw_files = glob.glob(\"./pretrain_dataset\" + '/**/*.py', recursive=True)\n", + "#pretrain_converted_files = glob.glob(\"./pretrain_converted_dataset\" + '/**/*.py', recursive=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "5Q4kLrTZXTjZ", + "outputId": "15e23b41-9245-454f-d9d6-c169eae0f943" + }, + "execution_count": 16, + "outputs": [ { - "cell_type": "code", - "source": [ - "def load_dataset(train_path,test_path,tokenizer):\n", - " train_dataset = TextDataset(\n", - " tokenizer=tokenizer,\n", - " file_path=train_path,\n", - " block_size=128)\n", - " \n", - " test_dataset = TextDataset(\n", - " tokenizer=tokenizer,\n", - " file_path=test_path,\n", - " block_size=128) \n", - " \n", - " data_collator = DataCollatorForLanguageModeling(\n", - " tokenizer=tokenizer, mlm=False,\n", - " )\n", - " return train_dataset,test_dataset,data_collator\n", - "\n", - "train_dataset,test_dataset,data_collator = load_dataset(\"./train.txt\", \"./test.txt\",tokenizer)\n", - "converted_train_dataset, converted_test_dataset, converted_datacollator = load_dataset(\"./converted_train.txt\", \"./converted_test.txt\",tokenizer)\n", - "#pretrain_raw_files = glob.glob(\"./pretrain_dataset\" + '/**/*.py', recursive=True)\n", - "#pretrain_converted_files = glob.glob(\"./pretrain_converted_dataset\" + '/**/*.py', recursive=True)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Q4kLrTZXTjZ", - "outputId": "15e23b41-9245-454f-d9d6-c169eae0f943" - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. 
You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", - " FutureWarning,\n", - "Token indices sequence length is longer than the specified maximum sequence length for this model (1381 > 1024). Running this sequence through the model will result in indexing errors\n" - ] - } - ] + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", + " FutureWarning,\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (1381 > 1024). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "tokenizer(\"for i in range(10)\")[\"input_ids\"]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "0AraoltupXmb", + "outputId": "c90124e7-3695-44af-827d-c355632ec2af" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "tokenizer(\"for i in range(10)\")[\"input_ids\"]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0AraoltupXmb", - "outputId": "c90124e7-3695-44af-827d-c355632ec2af" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[1640, 1312, 287, 2837, 7, 940, 8]" - ] - }, - "metadata": {}, - "execution_count": 10 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "[1640, 1312, 287, 2837, 7, 940, 8]" ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import transformers\n", + "import nlp\n", + "import logging\n", + "from datasets import load_dataset\n", + "from transformers import TextDataset,DataCollatorForLanguageModeling\n", + "\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "dataset_dict = {\n", + " \"token\": train_dataset,\n", + " \"token_type\": train_dataset,\n", + " \"line\": train_dataset,\n", + "}\n", + "\n", + "print(dataset_dict[\"token\"])\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "4ePqOauaqjnZ", + "outputId": "b6959f0d-812a-4cf2-95e4-2718a5be58bd" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import transformers\n", - "import nlp\n", - "import logging\n", - "from datasets import load_dataset\n", - "from transformers import TextDataset,DataCollatorForLanguageModeling\n", - "\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "\n", - "dataset_dict = {\n", - " \"token\": train_dataset,\n", - " \"token_type\": train_dataset,\n", - " \"line\": train_dataset,\n", - "}\n", - "\n", - "print(dataset_dict[\"token\"])\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4ePqOauaqjnZ", - "outputId": "b6959f0d-812a-4cf2-95e4-2718a5be58bd" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - 
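Since the FutureWarning above points to the 🤗 Datasets library as the replacement for the deprecated TextDataset, here is a rough sketch of the equivalent preprocessing; the block size of 128 and the train/test file names mirror the cell above, but the rest is an assumption, not code from this notebook:

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
block_size = 128

raw = load_dataset("text", data_files={"train": "./train.txt", "test": "./test.txt"})
tokenized = raw.map(lambda batch: tokenizer(batch["text"]), batched=True,
                    remove_columns=["text"])

def group_texts(examples):
    # Concatenate all token ids and cut them into fixed-size blocks,
    # mirroring what TextDataset(block_size=128) used to do.
    ids = sum(examples["input_ids"], [])
    total = (len(ids) // block_size) * block_size
    blocks = [ids[i:i + block_size] for i in range(0, total, block_size)]
    return {"input_ids": blocks}

lm_datasets = tokenized.map(group_texts, batched=True,
                            remove_columns=tokenized["train"].column_names)
# DataCollatorForLanguageModeling(tokenizer, mlm=False) can then supply the labels.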
"\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers.utils.dummy_pt_objects import GPT2LMHeadModel\n", + "from transformers import GPT2Tokenizer\n", + "from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel\n", + "\n", + "\n", + "class MultitaskModel(transformers.PreTrainedModel):\n", + " def __init__(self, encoder, taskmodels_dict):\n", + " \"\"\"\n", + " Setting MultitaskModel up as a PretrainedModel allows us\n", + " to take better advantage of Trainer features\n", + " \"\"\"\n", + " super().__init__(transformers.PretrainedConfig())\n", + "\n", + " self.encoder = encoder\n", + " self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)\n", + "\n", + " def _get_models(self):\n", + " return self.taskmodels_dict\n", + "\n", + " @classmethod\n", + " def create(cls, model_name, model_type_dict, model_config_dict):\n", + " \"\"\"\n", + " This creates a MultitaskModel using the model class and config objects\n", + " from single-task models. \n", + "\n", + " We do this by creating each single-task model, and having them share\n", + " the same encoder transformer.\n", + " \"\"\"\n", + " shared_encoder = None\n", + " taskmodels_dict = {}\n", + " for task_name, model_type in model_type_dict.items():\n", + " model = model_type.from_pretrained( \"gpt2\",\n", + " config=model_config_dict[task_name],\n", + " )\n", + " if shared_encoder is None:\n", + " shared_encoder = cls.get_encoder(model)\n", + " else:\n", + " setattr(model, \"encoder\", shared_encoder)\n", + " taskmodels_dict[task_name] = model\n", + " return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)\n", + " \n", + "\n", + " @classmethod\n", + " def get_encoder(cls, model):\n", + " \"\"\"\n", + " The encoder transformer is named differently in each model \"architecture\".\n", + " This method lets us get the name of the encoder attribute\n", + " \"\"\"\n", + " model_class_name = model.__class__.__name__\n", + " if model_class_name.startswith(\"Roberta\"):\n", + " return \"roberta-base\"\n", + " elif model_class_name.startswith(\"GPT2\"):\n", + " config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) \n", + " encoder_decoder = EncoderDecoderModel(config=config)\n", + " return encoder_decoder.config.encoder\n", + " else:\n", + " raise KeyError(f\"Add support for new model {model_class_name}\")\n", + " \n", + " def forward(self, task_name, **kwargs):\n", + " return self.taskmodels_dict[task_name](**kwargs)" + ], + "metadata": { + "id": "jJLxsM_H4A6z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model_name = \"gpt2\"\n", + "multitask_model = MultitaskModel.create(\n", + " model_name=model_name,\n", + " model_type_dict={\n", + " \"token\": transformers.AutoModelWithLMHead,\n", + " \"token_type\": transformers.AutoModelWithLMHead,\n", + " \"line\": transformers.AutoModelForSequenceClassification,\n", + " },\n", + " model_config_dict={\n", + " \"token\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"token_type\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"line\": transformers.AutoConfig.from_pretrained(model_name),\n", + " },\n", + ")" + ], + "metadata": { + "id": "9bbwa7E74Q0x", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "4aa35f34-7709-464a-8c6c-bc966a7f3be9" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "from 
transformers.utils.dummy_pt_objects import GPT2LMHeadModel\n", - "from transformers import GPT2Tokenizer\n", - "from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel\n", - "\n", - "\n", - "class MultitaskModel(transformers.PreTrainedModel):\n", - " def __init__(self, encoder, taskmodels_dict):\n", - " \"\"\"\n", - " Setting MultitaskModel up as a PretrainedModel allows us\n", - " to take better advantage of Trainer features\n", - " \"\"\"\n", - " super().__init__(transformers.PretrainedConfig())\n", - "\n", - " self.encoder = encoder\n", - " self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)\n", - "\n", - " def _get_models(self):\n", - " return self.taskmodels_dict\n", - "\n", - " @classmethod\n", - " def create(cls, model_name, model_type_dict, model_config_dict):\n", - " \"\"\"\n", - " This creates a MultitaskModel using the model class and config objects\n", - " from single-task models. \n", - "\n", - " We do this by creating each single-task model, and having them share\n", - " the same encoder transformer.\n", - " \"\"\"\n", - " shared_encoder = None\n", - " taskmodels_dict = {}\n", - " for task_name, model_type in model_type_dict.items():\n", - " model = model_type.from_pretrained( \"gpt2\",\n", - " config=model_config_dict[task_name],\n", - " )\n", - " if shared_encoder is None:\n", - " shared_encoder = cls.get_encoder(model)\n", - " else:\n", - " setattr(model, \"encoder\", shared_encoder)\n", - " taskmodels_dict[task_name] = model\n", - " return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)\n", - " \n", - "\n", - " @classmethod\n", - " def get_encoder(cls, model):\n", - " \"\"\"\n", - " The encoder transformer is named differently in each model \"architecture\".\n", - " This method lets us get the name of the encoder attribute\n", - " \"\"\"\n", - " model_class_name = model.__class__.__name__\n", - " if model_class_name.startswith(\"Roberta\"):\n", - " return \"roberta-base\"\n", - " elif model_class_name.startswith(\"GPT2\"):\n", - " config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) \n", - " encoder_decoder = EncoderDecoderModel(config=config)\n", - " return encoder_decoder.config.encoder\n", - " else:\n", - " raise KeyError(f\"Add support for new model {model_class_name}\")\n", - " \n", - " def forward(self, task_name, **kwargs):\n", - " return self.taskmodels_dict[task_name](**kwargs)" - ], - "metadata": { - "id": "jJLxsM_H4A6z" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " 
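Purely as an illustration of how the shared-encoder wrapper dispatches on task names once `multitask_model` has been built by the cell that calls `MultitaskModel.create` (the token ids below are dummy values, not data from the notebook):

import torch

# Dummy GPT-2 token ids, just to show the per-task dispatch.
dummy_ids = torch.tensor([[50256, 1640, 1312, 287]])

lm_out = multitask_model("token", input_ids=dummy_ids, labels=dummy_ids)
cls_out = multitask_model("line", input_ids=dummy_ids)

print(lm_out.loss)           # causal-LM loss from the "token" head
print(cls_out.logits.shape)  # sequence-classification logits from the "line" head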
\"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: 
FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n", + "\n", + "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Check that we have a GPU\n", + "!nvidia-smi\n", + "# Check that PyTorch sees it\n", + "import torch\n", + "torch.cuda.is_available()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "CNu7NKoYpvCP", + "outputId": "ae4d73f1-4657-4580-b85a-bfd1410bee7b" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "model_name = \"gpt2\"\n", - "multitask_model = MultitaskModel.create(\n", - " model_name=model_name,\n", - " model_type_dict={\n", - " \"token\": transformers.AutoModelWithLMHead,\n", - " \"token_type\": transformers.AutoModelWithLMHead,\n", - " \"line\": transformers.AutoModelForSequenceClassification,\n", - " },\n", - " model_config_dict={\n", - " \"token\": transformers.AutoConfig.from_pretrained(model_name),\n", - " \"token_type\": transformers.AutoConfig.from_pretrained(model_name),\n", - " \"line\": transformers.AutoConfig.from_pretrained(model_name),\n", - " },\n", - ")" - ], - "metadata": { - "id": "9bbwa7E74Q0x", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "4aa35f34-7709-464a-8c6c-bc966a7f3be9" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": 
[ - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " 
\"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", - " FutureWarning,\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n", - "\n", - "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. 
Make sure that the latest NVIDIA driver is installed and running.\n", + "\n" + ] }, { - "cell_type": "code", - "source": [ - "# Check that we have a GPU\n", - "!nvidia-smi\n", - "# Check that PyTorch sees it\n", - "import torch\n", - "torch.cuda.is_available()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CNu7NKoYpvCP", - "outputId": "ae4d73f1-4657-4580-b85a-bfd1410bee7b" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.\n", - "\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "False" - ] - }, - "metadata": {}, - "execution_count": 35 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import dataclasses\n", + "from torch.utils.data.dataloader import DataLoader\n", + "from transformers.data.data_collator import DataCollatorForLanguageModeling, InputDataClass, DefaultDataCollator\n", + "from torch.utils.data.distributed import DistributedSampler\n", + "from torch.utils.data.sampler import RandomSampler\n", + "from typing import List, Union, Dict\n", + "from transformers import Trainer\n", + "from random import random\n", + "\n", + "\n", + "class NLPDataCollator(DataCollatorForLanguageModeling):\n", + " \"\"\"\n", + " Extending the existing DataCollator to work with NLP dataset batches\n", + " \"\"\"\n", + " def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:\n", + " first = features[0]\n", + " if isinstance(first, dict):\n", + " # NLP data sets current works presents features as lists of dictionary\n", + " # (one per example), so we will adapt the collate_batch logic for that\n", + " if \"labels\" in first and first[\"labels\"] is not None:\n", + " if first[\"labels\"].dtype == torch.int64:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.long)\n", + " else:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.float)\n", + " batch = {\"labels\": labels}\n", + " for k, v in first.items():\n", + " if k != \"labels\" and v is not None and not isinstance(v, str):\n", + " batch[k] = torch.stack([f[k] for f in features])\n", + " return batch\n", + " else:\n", + " # otherwise, revert to using the default collate_batch\n", + " return DefaultDataCollator().collate_batch(features)\n", + "\n", + "\n", + "class StrIgnoreDevice(str):\n", + " \"\"\"\n", + " This is a hack. 
The Trainer is going call .to(device) on every input\n", + " value, but we need to pass in an additional `task_name` string.\n", + " This prevents it from throwing an error\n", + " \"\"\"\n", + " def to(self, device):\n", + " return self\n", + "\n", + "class DataLoaderWithTaskname:\n", + " \"\"\"\n", + " Wrapper around a DataLoader to also yield a task name\n", + " \"\"\"\n", + " def __init__(self, task_name, data_loader):\n", + " self.task_name = task_name\n", + " self.data_loader = data_loader\n", + "\n", + " self.batch_size = data_loader.batch_size\n", + " self.dataset = data_loader.dataset\n", + "\n", + " def __len__(self):\n", + " return len(self.data_loader)\n", + " \n", + " def __iter__(self):\n", + " for batch in self.data_loader:\n", + " batch[\"task_name\"] = StrIgnoreDevice(self.task_name)\n", + " yield batch\n", + "\n", + "\n", + "class MultitaskDataloader:\n", + " \"\"\"\n", + " Data loader that combines and samples from multiple single-task\n", + " data loaders.\n", + " \"\"\"\n", + " def __init__(self, dataloader_dict):\n", + " self.dataloader_dict = dataloader_dict\n", + " self.num_batches_dict = {\n", + " task_name: len(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " self.task_name_list = list(self.dataloader_dict)\n", + " self.dataset = [None] * sum(\n", + " len(dataloader.dataset) \n", + " for dataloader in self.dataloader_dict.values()\n", + " )\n", + "\n", + " def __len__(self):\n", + " return sum(self.num_batches_dict.values())\n", + "\n", + " def __iter__(self):\n", + " \"\"\"\n", + " For each batch, sample a task, and yield a batch from the respective\n", + " task Dataloader.\n", + "\n", + " We use size-proportional sampling, but you could easily modify this\n", + " to sample from some-other distribution.\n", + " \"\"\"\n", + " task_choice_list = []\n", + " for i, task_name in enumerate(self.task_name_list):\n", + " task_choice_list += [i] * self.num_batches_dict[task_name]\n", + " task_choice_list = np.array(task_choice_list)\n", + " np.random.shuffle(task_choice_list)\n", + " dataloader_iter_dict = {\n", + " task_name: iter(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " for task_choice in task_choice_list:\n", + " task_name = self.task_name_list[task_choice]\n", + " yield next(dataloader_iter_dict[task_name]) \n", + "\n", + "class MultitaskTrainer(transformers.Trainer):\n", + "\n", + " def get_single_train_dataloader(self, task_name, train_dataset):\n", + " \"\"\"\n", + " Create a single-task data loader that also yields task names\n", + " \"\"\"\n", + " if self.train_dataset is None:\n", + " raise ValueError(\"Trainer: training requires a train_dataset.\")\n", + " \n", + " train_sampler = (\n", + " RandomSampler(train_dataset)\n", + " if self.args.local_rank == -1\n", + " else DistributedSampler(train_dataset)\n", + " )\n", + "\n", + " data_loader = DataLoaderWithTaskname(\n", + " task_name=task_name,\n", + " data_loader=DataLoader(\n", + " train_dataset,\n", + " batch_size=self.args.train_batch_size,\n", + " sampler=train_sampler\n", + " ),\n", + " )\n", + "\n", + " return data_loader\n", + "\n", + " def get_train_dataloader(self):\n", + " \"\"\"\n", + " Returns a MultitaskDataloader, which is not actually a Dataloader\n", + " but an iterable that returns a generator that samples from each \n", + " task Dataloader\n", + " \"\"\"\n", + " return MultitaskDataloader({\n", + " task_name: self.get_single_train_dataloader(task_name, task_dataset)\n", + " for task_name, 
task_dataset in self.train_dataset.items()\n", + " })\n", + " \n", + " def train(self):\n", + " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", + " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", + " self.trainer = Trainer(\n", + " model=model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " ),\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " )\n", + " self.trainer.train()\n", + "\n", + " def prediction_loop(self):\n", + " return self.trainer.predict()\n", + "\n", + " def compute_loss(self, model, inputs, return_outputs=True):\n", + " labels = inputs.get(\"labels\")\n", + " # forward pass\n", + " outputs = model(**inputs)\n", + " reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor\n", + " logits = outputs.get(\"logits\")\n", + " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))\n", + " loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))\n", + " return (loss, outputs) if return_outputs else loss" + ], + "metadata": { + "id": "dzVHNee8EP-T" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "tzNxUL7gLsYV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "trainer = MultitaskTrainer(\n", + " model=multitask_model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " ),\n", + " data_collator=data_collator,\n", + ")\n", + "trainer.train()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "TJ3CrZfgHGKq", + "outputId": "30aa520a-c8c4-4b53-8659-9f30b068562c" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "import dataclasses\n", - "from torch.utils.data.dataloader import DataLoader\n", - "from transformers.data.data_collator import DataCollatorForLanguageModeling, InputDataClass, DefaultDataCollator\n", - "from torch.utils.data.distributed import DistributedSampler\n", - "from torch.utils.data.sampler import RandomSampler\n", - "from typing import List, Union, Dict\n", - "from transformers import Trainer\n", - "from random import random\n", - "\n", - "\n", - "class NLPDataCollator(DataCollatorForLanguageModeling):\n", - " \"\"\"\n", - " Extending the existing DataCollator to work with NLP dataset batches\n", - " \"\"\"\n", - " def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:\n", - " first = features[0]\n", - " if isinstance(first, dict):\n", - " # NLP data sets current works presents features as lists of dictionary\n", - " # (one per example), so we will adapt the collate_batch logic for that\n", - " if \"labels\" in first and first[\"labels\"] is not None:\n", - " if first[\"labels\"].dtype == torch.int64:\n", - " labels = torch.tensor([f[\"labels\"] for f in features], 
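The MultitaskDataloader docstring above notes that size-proportional sampling could be swapped for another distribution; a small, hypothetical sketch of temperature-scaled task sampling (tau=1 reproduces the size-proportional schedule, smaller values flatten it towards uniform) could look like this:

import numpy as np

def sample_task_schedule(num_batches_dict, tau=0.5, seed=0):
    # Draw one task name per step with probability proportional to size**tau.
    rng = np.random.default_rng(seed)
    names = list(num_batches_dict)
    sizes = np.array([num_batches_dict[n] for n in names], dtype=float)
    probs = sizes ** tau / (sizes ** tau).sum()
    steps = int(sizes.sum())
    return [names[i] for i in rng.choice(len(names), size=steps, p=probs)]

# e.g. sample_task_schedule({"token": 36, "token_type": 36, "line": 36})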
dtype=torch.long)\n", - " else:\n", - " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.float)\n", - " batch = {\"labels\": labels}\n", - " for k, v in first.items():\n", - " if k != \"labels\" and v is not None and not isinstance(v, str):\n", - " batch[k] = torch.stack([f[k] for f in features])\n", - " return batch\n", - " else:\n", - " # otherwise, revert to using the default collate_batch\n", - " return DefaultDataCollator().collate_batch(features)\n", - "\n", - "\n", - "class StrIgnoreDevice(str):\n", - " \"\"\"\n", - " This is a hack. The Trainer is going call .to(device) on every input\n", - " value, but we need to pass in an additional `task_name` string.\n", - " This prevents it from throwing an error\n", - " \"\"\"\n", - " def to(self, device):\n", - " return self\n", - "\n", - "class DataLoaderWithTaskname:\n", - " \"\"\"\n", - " Wrapper around a DataLoader to also yield a task name\n", - " \"\"\"\n", - " def __init__(self, task_name, data_loader):\n", - " self.task_name = task_name\n", - " self.data_loader = data_loader\n", - "\n", - " self.batch_size = data_loader.batch_size\n", - " self.dataset = data_loader.dataset\n", - "\n", - " def __len__(self):\n", - " return len(self.data_loader)\n", - " \n", - " def __iter__(self):\n", - " for batch in self.data_loader:\n", - " batch[\"task_name\"] = StrIgnoreDevice(self.task_name)\n", - " yield batch\n", - "\n", - "\n", - "class MultitaskDataloader:\n", - " \"\"\"\n", - " Data loader that combines and samples from multiple single-task\n", - " data loaders.\n", - " \"\"\"\n", - " def __init__(self, dataloader_dict):\n", - " self.dataloader_dict = dataloader_dict\n", - " self.num_batches_dict = {\n", - " task_name: len(dataloader) \n", - " for task_name, dataloader in self.dataloader_dict.items()\n", - " }\n", - " self.task_name_list = list(self.dataloader_dict)\n", - " self.dataset = [None] * sum(\n", - " len(dataloader.dataset) \n", - " for dataloader in self.dataloader_dict.values()\n", - " )\n", - "\n", - " def __len__(self):\n", - " return sum(self.num_batches_dict.values())\n", - "\n", - " def __iter__(self):\n", - " \"\"\"\n", - " For each batch, sample a task, and yield a batch from the respective\n", - " task Dataloader.\n", - "\n", - " We use size-proportional sampling, but you could easily modify this\n", - " to sample from some-other distribution.\n", - " \"\"\"\n", - " task_choice_list = []\n", - " for i, task_name in enumerate(self.task_name_list):\n", - " task_choice_list += [i] * self.num_batches_dict[task_name]\n", - " task_choice_list = np.array(task_choice_list)\n", - " np.random.shuffle(task_choice_list)\n", - " dataloader_iter_dict = {\n", - " task_name: iter(dataloader) \n", - " for task_name, dataloader in self.dataloader_dict.items()\n", - " }\n", - " for task_choice in task_choice_list:\n", - " task_name = self.task_name_list[task_choice]\n", - " yield next(dataloader_iter_dict[task_name]) \n", - "\n", - "class MultitaskTrainer(transformers.Trainer):\n", - "\n", - " def get_single_train_dataloader(self, task_name, train_dataset):\n", - " \"\"\"\n", - " Create a single-task data loader that also yields task names\n", - " \"\"\"\n", - " if self.train_dataset is None:\n", - " raise ValueError(\"Trainer: training requires a train_dataset.\")\n", - " \n", - " train_sampler = (\n", - " RandomSampler(train_dataset)\n", - " if self.args.local_rank == -1\n", - " else DistributedSampler(train_dataset)\n", - " )\n", - "\n", - " data_loader = DataLoaderWithTaskname(\n", - " task_name=task_name,\n", - " 
data_loader=DataLoader(\n", - " train_dataset,\n", - " batch_size=self.args.train_batch_size,\n", - " sampler=train_sampler\n", - " ),\n", - " )\n", - "\n", - " return data_loader\n", - "\n", - " def get_train_dataloader(self):\n", - " \"\"\"\n", - " Returns a MultitaskDataloader, which is not actually a Dataloader\n", - " but an iterable that returns a generator that samples from each \n", - " task Dataloader\n", - " \"\"\"\n", - " return MultitaskDataloader({\n", - " task_name: self.get_single_train_dataloader(task_name, task_dataset)\n", - " for task_name, task_dataset in self.train_dataset.items()\n", - " })\n", - " \n", - " def train(self):\n", - " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", - " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=transformers.TrainingArguments(\n", - " output_dir=\"./models/multitask_model\",\n", - " overwrite_output_dir=True,\n", - " learning_rate=1e-5,\n", - " do_train=True,\n", - " num_train_epochs=100,\n", - " # Adjust batch size if this doesn't fit on the Colab GPU\n", - " per_device_train_batch_size=8, \n", - " save_steps=3000,\n", - " ),\n", - " data_collator=data_collator,\n", - " train_dataset=train_dataset,\n", - " )\n", - " trainer.train()\n", - "\n", - " def compute_loss(self, model, inputs, return_outputs=True):\n", - " labels = inputs.get(\"labels\")\n", - " # forward pass\n", - " outputs = model(**inputs)\n", - " reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor\n", - " logits = outputs.get(\"logits\")\n", - " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))\n", - " loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))\n", - " return (loss, outputs) if return_outputs else loss" - ], - "metadata": { - "id": "dzVHNee8EP-T" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "" - ], - "metadata": { - "id": "tzNxUL7gLsYV" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. 
You should start updating your code and make this info disappear :-).\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", + "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " FutureWarning,\n", + "***** Running training *****\n", + " Num examples = 288\n", + " Num Epochs = 100\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. 
parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 3600\n" + ] }, { - "cell_type": "code", - "source": [ - "trainer = MultitaskTrainer(\n", - " model=multitask_model,\n", - " args=transformers.TrainingArguments(\n", - " output_dir=\"./models/multitask_model\",\n", - " overwrite_output_dir=True,\n", - " learning_rate=1e-5,\n", - " do_train=True,\n", - " num_train_epochs=100,\n", - " # Adjust batch size if this doesn't fit on the Colab GPU\n", - " per_device_train_batch_size=8, \n", - " save_steps=3000,\n", - " ),\n", - " data_collator=data_collator,\n", - ")\n", - "trainer.train()\n", - "\n" + "output_type": "display_data", + "data": { + "text/plain": [ + "" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "TJ3CrZfgHGKq", - "outputId": "30aa520a-c8c4-4b53-8659-9f30b068562c" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "PyTorch: setting up devices\n", - "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. 
Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", - " FutureWarning,\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "PyTorch: setting up devices\n", - "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", - "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " FutureWarning,\n", - "***** Running training *****\n", - " Num examples = 288\n", - " Num Epochs = 100\n", - " Instantaneous batch size per device = 8\n", - " Total train batch size (w. parallel, distributed & accumulation) = 8\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 3600\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [ 383/3600 1:49:51 < 15:27:36, 0.06 it/s, Epoch 10.61/100]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss

" - ] - }, - "metadata": {} - } + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [ 383/3600 1:49:51 < 15:27:36, 0.06 it/s, Epoch 10.61/100]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss

" ] - }, - { - "cell_type": "code", - "source": [ - " preds_dict = {}\n", - " for task_name in [\"token\", \"token_type\", \"line\"]:\n", - " eval_dataloader = DataLoaderWithTaskname(\n", - " task_name,\n", - " trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])\n", - " )\n", - " print(eval_dataloader.data_loader.collate_fn)\n", - " preds_dict[task_name] = trainer.prediction_loop(\n", - " eval_dataloader, \n", - " description=f\"Validation: {task_name}\",\n", - " )\n", - "\n", - "\n", - " print(preds_dict)" - ], - "metadata": { - "id": "Xgw82zyxp-_5" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from sklearn.metrics import accuracy_score, label_ranking_average_precision_score\n", - "\n", - "accuracy_dict = {}\n", - "mrr_dict = {}\n", - "\n", - "for task_name in [\"token\", \"token_type\", \"line\"]:\n", - " accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),\n", - " preds_dict[task_name].label_ids)\n", - " \n", - " mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),\n", - " preds_dict[task_name].label_ids)\n", - " " - ], - "metadata": { - "id": "XWUxUWUVE5Dq" - }, - "execution_count": null, - "outputs": [] + }, + "metadata": {} } - ] + ] + }, + { + "cell_type": "code", + "source": [ + " preds_dict = {}\n", + " for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " eval_dataloader = DataLoaderWithTaskname(\n", + " task_name,\n", + " trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])\n", + " )\n", + " print(eval_dataloader.data_loader.collate_fn)\n", + " preds_dict[task_name] = trainer.prediction_loop(\n", + " eval_dataloader, \n", + " description=f\"Validation: {task_name}\",\n", + " )\n", + "\n", + "\n", + " print(preds_dict)" + ], + "metadata": { + "id": "Xgw82zyxp-_5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score, label_ranking_average_precision_score\n", + "\n", + "accuracy_dict = {}\n", + "mrr_dict = {}\n", + "\n", + "for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " \n", + " mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " " + ], + "metadata": { + "id": "XWUxUWUVE5Dq" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file diff --git a/notebooks/Evaluation_of_CodeFill.ipynb b/notebooks/Evaluation_of_CodeFill.ipynb new file mode 100644 index 0000000..c31c519 --- /dev/null +++ b/notebooks/Evaluation_of_CodeFill.ipynb @@ -0,0 +1,1422 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Evaluation of CodeFill.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "03103d813d964d819b6a2c0a1afac919": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b478a5503c314e8880931716b4d3be31", + "IPY_MODEL_600a7388c66a464285e598195ef980a0", + "IPY_MODEL_c2f23a4bfec2424f8556a460757b7f7c" + ], + "layout": "IPY_MODEL_f8962c79a027458e8d282f95b9db14e6" + } + }, + "b478a5503c314e8880931716b4d3be31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_17fd247324f84aedbd3cf77d9e6a9970", + "placeholder": "​", + "style": "IPY_MODEL_a44ac743025e4b9db4161a1215b35904", + "value": "Downloading: 100%" + } + }, + "600a7388c66a464285e598195ef980a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ca2c310b21a049aa85be1be907b14456", + "max": 548118077, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3097af23a3e547f9a34d6d5e61e189c9", + "value": 548118077 + } + }, + "c2f23a4bfec2424f8556a460757b7f7c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_36fcac3bb12e4934a548fcc2e85eee8a", + "placeholder": "​", + "style": "IPY_MODEL_74028709db644e8c9ef1865cf4396e63", + "value": " 523M/523M [00:12<00:00, 41.5MB/s]" + } + }, + "f8962c79a027458e8d282f95b9db14e6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17fd247324f84aedbd3cf77d9e6a9970": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a44ac743025e4b9db4161a1215b35904": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ca2c310b21a049aa85be1be907b14456": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3097af23a3e547f9a34d6d5e61e189c9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"36fcac3bb12e4934a548fcc2e85eee8a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74028709db644e8c9ef1865cf4396e63": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Install the correct dependencies on HuggingFace transformer and ternsorflow" + ], + "metadata": { + "id": "xIT_uHUdThub" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f3KWTckbTUs2", + "outputId": "3a3b09c5-4700-4c1e-8a90-5f74584fdd8c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: tensorflow 2.8.0\n", + "Uninstalling tensorflow-2.8.0:\n", + " Successfully uninstalled tensorflow-2.8.0\n", + "Collecting git+https://github.com/huggingface/transformers\n", + " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-plp9s8wa\n", + " Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-plp9s8wa\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (2.23.0)\n", + "Collecting huggingface-hub<1.0,>=0.1.0\n", + " Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)\n", + "\u001b[K |████████████████████████████████| 84 kB 3.5 MB/s \n", + "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (4.11.3)\n", + "Collecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", + " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", + "\u001b[K |████████████████████████████████| 6.6 MB 41.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (21.3)\n", + "Collecting pyyaml>=5.1\n", + " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", + "\u001b[K |████████████████████████████████| 596 kB 52.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (4.64.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (1.21.6)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (3.6.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.20.0.dev0) (2019.12.20)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.20.0.dev0) (4.2.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.20.0.dev0) (3.0.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.20.0.dev0) (3.8.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.20.0.dev0) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.20.0.dev0) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.20.0.dev0) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.20.0.dev0) (2.10)\n", + "Building wheels for collected packages: transformers\n", + " Building wheel for transformers (PEP 517) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for transformers: filename=transformers-4.20.0.dev0-py3-none-any.whl size=4171103 sha256=8b39ec060327b98dc94c8a22f4026398eecafcdceea6576419f831cee6db09b5\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-0z34eq2h/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522\n", + "Successfully built transformers\n", + "Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers\n", + " Attempting uninstall: pyyaml\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + "Successfully installed huggingface-hub-0.6.0 pyyaml-6.0 tokenizers-0.12.1 transformers-4.20.0.dev0\n", + "tokenizers 0.12.1\n", + "transformers 4.20.0.dev0\n", + "Collecting nlp==0.2.0\n", + " Downloading nlp-0.2.0-py3-none-any.whl (857 kB)\n", + "\u001b[K |████████████████████████████████| 857 kB 28.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (0.3.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (1.21.6)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (4.64.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (3.6.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (2.23.0)\n", + "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (6.0.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2021.10.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2.10)\n", + "Installing collected packages: nlp\n", + "Successfully installed nlp-0.2.0\n", + "Collecting datasets\n", + " Downloading datasets-2.2.1-py3-none-any.whl (342 kB)\n", + "\u001b[K |████████████████████████████████| 342 kB 20.2 MB/s \n", + "\u001b[?25hRequirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.6.0)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", + "Collecting responses<0.19\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", + 
"Collecting xxhash\n", + " Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", + "\u001b[K |████████████████████████████████| 212 kB 47.2 MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.0)\n", + "Collecting aiohttp\n", + " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", + "\u001b[K |████████████████████████████████| 1.1 MB 56.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.11.3)\n", + "Collecting fsspec[http]>=2021.05.0\n", + " Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)\n", + "\u001b[K |████████████████████████████████| 136 kB 72.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", + "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", + " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", + "\u001b[K |████████████████████████████████| 127 kB 65.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.12)\n", + "Collecting frozenlist>=1.1.1\n", + " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "\u001b[K |████████████████████████████████| 144 kB 43.4 MB/s \n", + "\u001b[?25hCollecting aiosignal>=1.1.2\n", + " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", + "Collecting asynctest==0.13.0\n", + " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", + "Collecting multidict<7.0,>=4.5\n", + " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", + "\u001b[K |████████████████████████████████| 94 kB 3.2 MB/s \n", + "\u001b[?25hCollecting yarl<2.0,>=1.0\n", + " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", + "\u001b[K |████████████████████████████████| 271 kB 52.3 MB/s \n", + "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", + 
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", + "Installing collected packages: multidict, frozenlist, yarl, urllib3, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, responses, datasets\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.24.3\n", + " Uninstalling urllib3-1.24.3:\n", + " Successfully uninstalled urllib3-1.24.3\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", + "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001b[0m\n", + "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.2.1 frozenlist-1.3.0 fsspec-2022.3.0 multidict-6.0.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2\n", + "Collecting git+https://github.com/huggingface/nlp\n", + " Cloning https://github.com/huggingface/nlp to /tmp/pip-req-build-feh2or5e\n", + " Running command git clone -q https://github.com/huggingface/nlp /tmp/pip-req-build-feh2or5e\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (1.21.6)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (6.0.1)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (0.3.4)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (1.3.5)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (2.23.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (4.64.0)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (3.0.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (0.70.12.2)\n", + "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (2022.3.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (3.8.1)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (0.6.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (21.3)\n", + "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (0.18.0)\n", + "Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.7/dist-packages (from datasets==2.2.2.dev0) (4.11.3)\n", + "Requirement already 
satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.2.2.dev0) (3.6.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.2.2.dev0) (6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.2.2.dev0) (4.2.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==2.2.2.dev0) (3.0.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.2.2.dev0) (2021.10.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.2.2.dev0) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.2.2.dev0) (1.25.11)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.2.2.dev0) (3.0.4)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (2.0.12)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (21.4.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (1.7.2)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (4.0.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (6.0.2)\n", + "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (0.13.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (1.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.2.2.dev0) (1.3.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata->datasets==2.2.2.dev0) (3.8.0)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.2.2.dev0) (2022.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.2.2.dev0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==2.2.2.dev0) (1.15.0)\n", + "Building wheels for collected packages: datasets\n", + " Building wheel for datasets (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for datasets: filename=datasets-2.2.2.dev0-py3-none-any.whl size=344393 sha256=a978a302726ae208dbbf56e886c30425f98b51e04f16a8abf0993ea4f0f473a7\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-f7wyr7sc/wheels/b7/b2/b6/a0b4e0d11cb66d705e54f7bb72fdbe910b5e9f198ada8b4347\n", + "Successfully built datasets\n", + "Installing collected packages: datasets\n", + " Attempting uninstall: datasets\n", + " Found existing installation: datasets 2.2.1\n", + " Uninstalling datasets-2.2.1:\n", + " Successfully uninstalled datasets-2.2.1\n", + "Successfully installed datasets-2.2.2.dev0\n" + ] + } + ], + "source": [ + "# We won't need TensorFlow here\n", + "!pip uninstall -y tensorflow\n", + "# Install `transformers` from master\n", + "!pip install git+https://github.com/huggingface/transformers\n", + "!pip list | grep -E 'transformers|tokenizers'\n", + "!pip install nlp==0.2.0\n", + "!pip install datasets\n", + "!pip install git+https://github.com/huggingface/nlp\n", + "\n", + "# transformers version at notebook update --- 2.11.0\n", + "# tokenizers version at notebook update --- 0.8.0rc1" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Train a customised python byte-level Byte-pair encoding tokenizer. " + ], + "metadata": { + "id": "M0wmpgCxUIF3" + } + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from transformers import AutoTokenizer,TextDataset,DataCollatorForLanguageModeling\n", + "import glob\n", + "import random \n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n" + ], + "metadata": { + "id": "oPq1Bau8UbpB", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "84eaefed-3eee-452f-fa68-c43cc1052d3c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.20.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at 
/root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n", + "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n", + "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n", + "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n", + "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n", + "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.20.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def compute_metric():\n", + "\n", + " nlp.load_metric('accuracy', name=\"token\").compute(\n", + " np.argmax(preds_dict[\"token\"].predictions, axis=1),\n", + " preds_dict[\"token\"].label_ids,\n", + " )\n", + "\n", + " nlp.load_metric('average_precision_score', name=\"token\").compute(\n", + " np.argmax(preds_dict[\"token\"].predictions, axis=1),\n", + " preds_dict[\"token\"].label_ids,\n", + " )\n", + "\n", + " nlp.load_metric('bleu', name=\"line\").compute(\n", + " np.argmax(preds_dict[\"line\"].predictions, axis=1),\n", + " preds_dict[\"line\"].label_ids,\n", + " )\n", + "\n", + " nlp.load_metric('meteor', name=\"line\").compute(\n", + " np.argmax(preds_dict[\"line\"].predictions, axis=1),\n", + " preds_dict[\"line\"].label_ids,\n", + " )\n", + "\n", + " nlp.load_metric('rouge', name=\"line\").compute(\n", + " np.argmax(preds_dict[\"line\"].predictions, axis=1),\n", + " preds_dict[\"line\"].label_ids,\n", + " )\n" + ], + "metadata": { + "id": "aJkbrM1h2gH0" + }, + "execution_count": 2, + "outputs": [] + }, + { 
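+ "cell_type": "markdown", + "source": [ + "A minimal sketch, assuming token-level logits and integer label ids as numpy arrays: compute_metric above calls nlp.load_metric(...).compute(...) but never returns the scores, while transformers.Trainer expects a compute_metrics(eval_pred) callback that returns a dict of metric values. The helper name compute_metrics_sketch in the next cell is illustrative only, not part of the training pipeline." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "# Hedged sketch: a Trainer-compatible metric callback that returns its scores.\n", + "# Assumes eval_pred is a (logits, label_ids) pair of numpy arrays.\n", + "import numpy as np\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "def compute_metrics_sketch(eval_pred):\n", + "    logits, label_ids = eval_pred\n", + "    preds = np.argmax(logits, axis=-1)\n", + "    mask = label_ids != -100  # skip padded positions, if any\n", + "    return {\"accuracy\": accuracy_score(label_ids[mask], preds[mask])}\n" + ], + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { 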
+ "cell_type": "code", + "source": [ + "paths = [str(x) for x in Path(\".\").glob(\"./sample_data/test_data/*.py\")]\n", + "converted_paths = []\n", + "for path in paths:\n", + " converted_path = \"./sample_data/test_data/\"+ path.split(\"/\").pop().split(\".\")[0] + \".txt\"\n", + " print(converted_path)\n", + " try:\n", + " convert(path, converted_path)\n", + " converted_paths.append(converted_path)\n", + " except:\n", + " pass\n", + "\n", + "\n", + "with open(\"./test.txt\", \"wb\") as test_outfile:\n", + " for f in paths:\n", + " with open(f, \"rb\") as infile:\n", + " test_outfile.write(infile.read())\n", + "\n", + "with open(\"./converted_test.txt\", \"wb\") as test_outfile:\n", + " for f in converted_paths:\n", + " with open(f, \"rb\") as infile:\n", + " test_outfile.write(infile.read())\n", + "\n" + ], + "metadata": { + "id": "vOXkK5bWYNXz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def load_dataset(test_path,tokenizer):\n", + " \n", + " test_dataset = TextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=test_path,\n", + " block_size=128) \n", + " \n", + " data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=False,\n", + " )\n", + " return test_dataset,data_collator\n", + "\n", + "test_dataset,data_collator = load_dataset(\"./train.txt\", \"./test.txt\",tokenizer)\n", + "converted_test_dataset, converted_datacollator = load_dataset(\"./converted_train.txt\", \"./converted_test.txt\",tokenizer)\n", + "#pretrain_raw_files = glob.glob(\"./pretrain_dataset\" + '/**/*.py', recursive=True)\n", + "#pretrain_converted_files = glob.glob(\"./pretrain_converted_dataset\" + '/**/*.py', recursive=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Q4kLrTZXTjZ", + "outputId": "254ec48b-7aa2-49b6-9b92-7c5d6c673794" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. 
You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", + " FutureWarning,\n", + "Loading features from cached file ./cached_lm_GPT2TokenizerFast_128_train.txt [took 0.000 s]\n", + "Loading features from cached file ./cached_lm_GPT2TokenizerFast_128_test.txt [took 0.000 s]\n", + "Loading features from cached file ./cached_lm_GPT2TokenizerFast_128_converted_train.txt [took 0.000 s]\n", + "Loading features from cached file ./cached_lm_GPT2TokenizerFast_128_converted_test.txt [took 0.000 s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "tokenizer(\"for i in range(10)\")[\"input_ids\"]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0AraoltupXmb", + "outputId": "c90124e7-3695-44af-827d-c355632ec2af" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[1640, 1312, 287, 2837, 7, 940, 8]" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import transformers\n", + "import nlp\n", + "import logging\n", + "from datasets import load_dataset\n", + "from transformers import TextDataset,DataCollatorForLanguageModeling\n", + "\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "dataset_dict = {\n", + " \"token\": train_dataset,\n", + " \"token_type\": train_dataset,\n", + " \"line\": train_dataset,\n", + "}\n", + "\n", + "print(dataset_dict[\"token\"])\n" + ], + "metadata": { + "id": "4ePqOauaqjnZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers.utils.dummy_pt_objects import GPT2LMHeadModel\n", + "from transformers import GPT2Tokenizer\n", + "from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel\n", + "\n", + "\n", + "class MultitaskModel(transformers.PreTrainedModel):\n", + " def __init__(self, encoder, taskmodels_dict):\n", + " \"\"\"\n", + " Setting MultitaskModel up as a PretrainedModel allows us\n", + " to take better advantage of Trainer features\n", + " \"\"\"\n", + " super().__init__(transformers.PretrainedConfig())\n", + "\n", + " self.encoder = encoder\n", + " self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)\n", + "\n", + " def _get_models(self):\n", + " return self.taskmodels_dict\n", + "\n", + " @classmethod\n", + " def create(cls, model_name, model_type_dict, model_config_dict):\n", + " \"\"\"\n", + " This creates a MultitaskModel using the model class and config objects\n", + " from single-task models. 
\n", + "\n", + " We do this by creating each single-task model, and having them share\n", + " the same encoder transformer.\n", + " \"\"\"\n", + " shared_encoder = None\n", + " taskmodels_dict = {}\n", + " for task_name, model_type in model_type_dict.items():\n", + " model = model_type.from_pretrained( \"gpt2\",\n", + " config=model_config_dict[task_name],\n", + " )\n", + " if shared_encoder is None:\n", + " shared_encoder = cls.get_encoder(model)\n", + " else:\n", + " setattr(model, \"encoder\", shared_encoder)\n", + " taskmodels_dict[task_name] = model\n", + " return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)\n", + " \n", + "\n", + " @classmethod\n", + " def get_encoder(cls, model):\n", + " \"\"\"\n", + " The encoder transformer is named differently in each model \"architecture\".\n", + " This method lets us get the name of the encoder attribute\n", + " \"\"\"\n", + " model_class_name = model.__class__.__name__\n", + " if model_class_name.startswith(\"Roberta\"):\n", + " return \"roberta-base\"\n", + " elif model_class_name.startswith(\"GPT2\"):\n", + " config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) \n", + " encoder_decoder = EncoderDecoderModel(config=config)\n", + " return encoder_decoder.config.encoder\n", + " else:\n", + " raise KeyError(f\"Add support for new model {model_class_name}\")\n", + " \n", + " def forward(self, task_name, **kwargs):\n", + " return self.taskmodels_dict[task_name](**kwargs)" + ], + "metadata": { + "id": "jJLxsM_H4A6z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model_name = \"gpt2\"\n", + "multitask_model = MultitaskModel.create(\n", + " model_name=model_name,\n", + " model_type_dict={\n", + " \"token\": transformers.AutoModelWithLMHead,\n", + " \"token_type\": transformers.AutoModelWithLMHead,\n", + " \"line\": transformers.AutoModelForSequenceClassification,\n", + " },\n", + " model_config_dict={\n", + " \"token\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"token_type\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"line\": transformers.AutoConfig.from_pretrained(model_name),\n", + " },\n", + ")" + ], + "metadata": { + "id": "9bbwa7E74Q0x", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140, + "referenced_widgets": [ + "03103d813d964d819b6a2c0a1afac919", + "b478a5503c314e8880931716b4d3be31", + "600a7388c66a464285e598195ef980a0", + "c2f23a4bfec2424f8556a460757b7f7c", + "f8962c79a027458e8d282f95b9db14e6", + "17fd247324f84aedbd3cf77d9e6a9970", + "a44ac743025e4b9db4161a1215b35904", + "ca2c310b21a049aa85be1be907b14456", + "3097af23a3e547f9a34d6d5e61e189c9", + "36fcac3bb12e4934a548fcc2e85eee8a", + "74028709db644e8c9ef1865cf4396e63" + ] + }, + "outputId": "776e3562-c46f-47da-d4b9-217b85d9cb79" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:925: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. 
Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading: 0%| | 0.00/523M [00:00 Dict[str, torch.Tensor]:\n", + " first = features[0]\n", + " if isinstance(first, dict):\n", + " # NLP data sets current works presents features as lists of dictionary\n", + " # (one per example), so we will adapt the collate_batch logic for that\n", + " if \"labels\" in first and first[\"labels\"] is not None:\n", + " if first[\"labels\"].dtype == torch.int64:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.long)\n", + " else:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.float)\n", + " batch = {\"labels\": labels}\n", + " for k, v in first.items():\n", + " if k != \"labels\" and v is not None and not isinstance(v, str):\n", + " batch[k] = torch.stack([f[k] for f in features])\n", + " return batch\n", + " else:\n", + " # otherwise, revert to using the default collate_batch\n", + " return DefaultDataCollator().collate_batch(features)\n", + "\n", + "\n", + "class StrIgnoreDevice(str):\n", + " \"\"\"\n", + " This is a hack. The Trainer is going call .to(device) on every input\n", + " value, but we need to pass in an additional `task_name` string.\n", + " This prevents it from throwing an error\n", + " \"\"\"\n", + " def to(self, device):\n", + " return self\n", + "\n", + "class DataLoaderWithTaskname:\n", + " \"\"\"\n", + " Wrapper around a DataLoader to also yield a task name\n", + " \"\"\"\n", + " def __init__(self, task_name, data_loader):\n", + " self.task_name = task_name\n", + " self.data_loader = data_loader\n", + "\n", + " self.batch_size = data_loader.batch_size\n", + " self.dataset = data_loader.dataset\n", + "\n", + " def __len__(self):\n", + " return len(self.data_loader)\n", + " \n", + " def __iter__(self):\n", + " for batch in self.data_loader:\n", + " batch[\"task_name\"] = StrIgnoreDevice(self.task_name)\n", + " yield batch\n", + "\n", + "\n", + "class MultitaskDataloader:\n", + " \"\"\"\n", + " Data loader that combines and samples from multiple single-task\n", + " data loaders.\n", + " \"\"\"\n", + " def __init__(self, dataloader_dict):\n", + " self.dataloader_dict = dataloader_dict\n", + " self.num_batches_dict = {\n", + " task_name: len(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " self.task_name_list = list(self.dataloader_dict)\n", + " self.dataset = [None] * sum(\n", + " len(dataloader.dataset) \n", + " for dataloader in self.dataloader_dict.values()\n", + " )\n", + "\n", + " def __len__(self):\n", + " return sum(self.num_batches_dict.values())\n", + "\n", + " def __iter__(self):\n", + " \"\"\"\n", + " For each batch, sample a task, and yield a batch from the respective\n", + " task Dataloader.\n", + "\n", + " We use size-proportional sampling, but you could easily modify this\n", + " to sample from some-other distribution.\n", + " \"\"\"\n", + " task_choice_list = []\n", + " for i, task_name in enumerate(self.task_name_list):\n", + " task_choice_list += [i] * self.num_batches_dict[task_name]\n", + " task_choice_list = np.array(task_choice_list)\n", + " np.random.shuffle(task_choice_list)\n", + " dataloader_iter_dict = {\n", + " task_name: iter(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " for task_choice 
in task_choice_list:\n", + " task_name = self.task_name_list[task_choice]\n", + " yield next(dataloader_iter_dict[task_name]) \n", + "\n", + "class MultitaskTrainer(transformers.Trainer):\n", + "\n", + " def get_single_train_dataloader(self, task_name, train_dataset):\n", + " \"\"\"\n", + " Create a single-task data loader that also yields task names\n", + " \"\"\"\n", + " if self.train_dataset is None:\n", + " raise ValueError(\"Trainer: training requires a train_dataset.\")\n", + " \n", + " train_sampler = (\n", + " RandomSampler(train_dataset)\n", + " if self.args.local_rank == -1\n", + " else DistributedSampler(train_dataset)\n", + " )\n", + "\n", + " data_loader = DataLoaderWithTaskname(\n", + " task_name=task_name,\n", + " data_loader=DataLoader(\n", + " train_dataset,\n", + " batch_size=self.args.train_batch_size,\n", + " sampler=train_sampler\n", + " ),\n", + " )\n", + "\n", + " return data_loader\n", + "\n", + " def get_train_dataloader(self):\n", + " \"\"\"\n", + " Returns a MultitaskDataloader, which is not actually a Dataloader\n", + " but an iterable that returns a generator that samples from each \n", + " task Dataloader\n", + " \"\"\"\n", + " return MultitaskDataloader({\n", + " task_name: self.get_single_train_dataloader(task_name, task_dataset)\n", + " for task_name, task_dataset in self.train_dataset.items()\n", + " })\n", + " \n", + " def train(self):\n", + " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", + " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", + " self.trainer = Trainer(\n", + " model=model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " ),\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " )\n", + " self.trainer.train()\n", + " \n", + " def evaluate(self):\n", + " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", + " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", + " self.trainer = Trainer(\n", + " model=model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " compute_metrics=compute_metrics\n", + " ),\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " )\n", + " self.trainer.train()\n", + "\n", + " #run evaluation first to set the compute metrics to compute full set of metrics \n", + " def prediction_loop(self):\n", + " return self.trainer.predict()\n", + "\n", + " def compute_loss(self, model, inputs, return_outputs=True):\n", + " labels = inputs.get(\"labels\")\n", + " # forward pass\n", + " outputs = model(**inputs)\n", + " reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor\n", + " logits = outputs.get(\"logits\")\n", + " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))\n", + " loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))\n", + " return (loss, outputs) if return_outputs else loss" + ], + "metadata": { + "id": 
"dzVHNee8EP-T" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# point to training folder \n", + "model_path = \"./models/multitask_model\"\n", + "model = AutoModelWithLMHead.from_pretrained(model_path)\n", + "\n", + "# Define test trainer\n", + "test_trainer = Trainer(model)\n", + "trainer.train()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "TJ3CrZfgHGKq", + "outputId": "3d86f277-b512-4468-f736-242baf745030" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.20.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:925: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. 
Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n" + ] + }, + { + "output_type": "error", + "ename": "ValueError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mdata_collator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata_collator\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m )\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0mtrain_dataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m )\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_outputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1322\u001b[0m \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1323\u001b[0m \u001b[0mtrial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1324\u001b[0;31m 
\u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1325\u001b[0m )\n\u001b[1;32m 1326\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1330\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_train_batch_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1331\u001b[0m \u001b[0;31m# Data loader and number of training steps\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1332\u001b[0;31m \u001b[0mtrain_dataloader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_train_dataloader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1334\u001b[0m \u001b[0;31m# Setting up training control variables:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mget_train_dataloader\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 767\u001b[0m )\n\u001b[1;32m 768\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 769\u001b[0;31m \u001b[0mtrain_sampler\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_train_sampler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 770\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 771\u001b[0m return DataLoader(\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_get_train_sampler\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 708\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mworld_size\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m_is_torch_generator_available\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 710\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mRandomSampler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgenerator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgenerator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 711\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mRandomSampler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 712\u001b[0m elif (\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/torch/utils/data/sampler.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data_source, replacement, num_samples, generator)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_samples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_samples\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m raise ValueError(\"num_samples should be a positive integer \"\n\u001b[0;32m---> 98\u001b[0;31m \"value, but got num_samples={}\".format(self.num_samples))\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: num_samples should be a positive integer value, but got num_samples=0" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "preds_dict = {}\n", + "for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " eval_dataloader = DataLoaderWithTaskname(\n", + " task_name,\n", + " trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])\n", + " )\n", + " print(eval_dataloader.data_loader.collate_fn)\n", + " preds_dict[task_name] = trainer.prediction_loop(\n", + " eval_dataloader, \n", + " description=f\"Validation: {task_name}\",\n", + " )\n", + "\n", + "\n", + "print(preds_dict)" + ], + "metadata": { + "id": "Xgw82zyxp-_5", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6be34973-95c4-4acf-d3a4-1a072e36373a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "***** Running Validation: token *****\n", + " Num examples = 0\n", + " Batch size = 8\n", + "/usr/local/lib/python3.7/dist-packages/transformers/trainer_pt_utils.py:395: FutureWarning: DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.\n", + " FutureWarning,\n", + "***** Running Validation: token_type *****\n", + " Num examples = 0\n", + " Batch size = 8\n", + "***** Running Validation: line *****\n", + " Num examples = 0\n", + " Batch size = 8\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score, label_ranking_average_precision_score\n", + "\n", + "accuracy_dict = {}\n", + "mrr_dict = {}\n", + "\n", + "for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " \n", + " mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " \n", + " " + ], + "metadata": { + "id": "XWUxUWUVE5Dq" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file