diff --git a/notebooks/CodeFill.ipynb b/notebooks/CodeFill.ipynb index 4761044..b26ce26 100644 --- a/notebooks/CodeFill.ipynb +++ b/notebooks/CodeFill.ipynb @@ -1,1388 +1,1391 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "CodeFill.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Install the correct dependencies on HuggingFace transformer and ternsorflow" + ], + "metadata": { + "id": "xIT_uHUdThub" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "CodeFill.ipynb", - "provenance": [], - "collapsed_sections": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" + "id": "f3KWTckbTUs2", + "outputId": "2012bedd-8fb1-4909-d4bc-1f0ce0ad416e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: tensorflow 2.8.0\n", + "Uninstalling tensorflow-2.8.0:\n", + " Successfully uninstalled tensorflow-2.8.0\n", + "Collecting git+https://github.com/huggingface/transformers\n", + " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nl4u3bjj\n", + " Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-nl4u3bjj\n", + " Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n", + " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n", + " Preparing wheel metadata ... \u001B[?25l\u001B[?25hdone\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2.23.0)\n", + "Collecting huggingface-hub<1.0,>=0.1.0\n", + " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n", + "\u001B[K |████████████████████████████████| 77 kB 4.2 MB/s \n", + "\u001B[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2019.12.20)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.11.3)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (21.3)\n", + "Collecting pyyaml>=5.1\n", + " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", + "\u001B[K |████████████████████████████████| 596 kB 16.8 MB/s \n", + "\u001B[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", + " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", + "\u001B[K |████████████████████████████████| 6.6 MB 42.7 MB/s \n", + "\u001B[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (3.6.0)\n", + "Collecting sacremoses\n", + " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n", + "\u001B[K |████████████████████████████████| 895 kB 44.5 MB/s \n", + "\u001B[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.64.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from 
transformers==4.19.0.dev0) (1.21.6)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.19.0.dev0) (4.2.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.19.0.dev0) (3.0.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.19.0.dev0) (3.8.0)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (3.0.4)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.1.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.15.0)\n", + "Building wheels for collected packages: transformers\n", + " Building wheel for transformers (PEP 517) ... \u001B[?25l\u001B[?25hdone\n", + " Created wheel for transformers: filename=transformers-4.19.0.dev0-py3-none-any.whl size=4040857 sha256=84073fb7ad0bd8a06a08636e44e15aadf97eb4dd26c2d2a0e3dfc90232f8158b\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-ii8u1on2/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522\n", + "Successfully built transformers\n", + "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", + " Attempting uninstall: pyyaml\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + "Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.19.0.dev0\n", + "tokenizers 0.12.1\n", + "transformers 4.19.0.dev0\n", + "Collecting nlp==0.2.0\n", + " Downloading nlp-0.2.0-py3-none-any.whl (857 kB)\n", + "\u001B[K |████████████████████████████████| 857 kB 8.9 MB/s \n", + "\u001B[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (3.6.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (2.23.0)\n", + "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (6.0.1)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (1.21.6)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (4.64.0)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (0.3.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2021.10.8)\n", + "Requirement already satisfied: 
chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2.10)\n", + "Installing collected packages: nlp\n", + "Successfully installed nlp-0.2.0\n", + "Collecting datasets\n", + " Downloading datasets-2.1.0-py3-none-any.whl (325 kB)\n", + "\u001B[K |████████████████████████████████| 325 kB 8.6 MB/s \n", + "\u001B[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", + "Collecting fsspec[http]>=2021.05.0\n", + " Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)\n", + "\u001B[K |████████████████████████████████| 136 kB 60.7 MB/s \n", + "\u001B[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.11.3)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.5.1)\n", + "Collecting xxhash\n", + " Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", + "\u001B[K |████████████████████████████████| 212 kB 49.9 MB/s \n", + "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.0)\n", + "Collecting aiohttp\n", + " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", + "\u001B[K |████████████████████████████████| 1.1 MB 48.8 MB/s \n", + "\u001B[?25hRequirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", + "Collecting responses<0.19\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in 
/usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", + "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", + " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", + "\u001B[K |████████████████████████████████| 127 kB 59.6 MB/s \n", + "\u001B[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.12)\n", + "Collecting asynctest==0.13.0\n", + " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", + "Collecting multidict<7.0,>=4.5\n", + " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", + "\u001B[K |████████████████████████████████| 94 kB 3.7 MB/s \n", + "\u001B[?25hCollecting yarl<2.0,>=1.0\n", + " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", + "\u001B[K |████████████████████████████████| 271 kB 57.7 MB/s \n", + "\u001B[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", + "Collecting frozenlist>=1.1.1\n", + " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "\u001B[K |████████████████████████████████| 144 kB 62.5 MB/s \n", + "\u001B[?25hCollecting aiosignal>=1.1.2\n", + " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", + "Installing collected packages: multidict, frozenlist, yarl, urllib3, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, responses, datasets\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 1.24.3\n", + " Uninstalling urllib3-1.24.3:\n", + " Successfully uninstalled urllib3-1.24.3\n", + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", + "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001B[0m\n", + "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.1.0 frozenlist-1.3.0 fsspec-2022.3.0 multidict-6.0.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2\n", + "Collecting git+https://github.com/huggingface/nlp\n", + " Cloning https://github.com/huggingface/nlp to /tmp/pip-req-build-zp5gwnk_\n", + " Running command git clone -q https://github.com/huggingface/nlp /tmp/pip-req-build-zp5gwnk_\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.21.6)\n", + "Requirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (6.0.1)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.3.4)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.3.5)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2.23.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.64.0)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.0.0)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.70.12.2)\n", + "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2022.3.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.8.1)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.5.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (21.3)\n", + "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.18.0)\n", + "Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.11.3)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (3.6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (4.2.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (6.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==2.1.1.dev0) (3.0.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2021.10.8)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
/usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (1.25.11)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (3.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (4.0.2)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (2.0.12)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.3.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.2.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (21.4.0)\n", + "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (0.13.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.7.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (6.0.2)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata->datasets==2.1.1.dev0) (3.8.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2022.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==2.1.1.dev0) (1.15.0)\n", + "Building wheels for collected packages: datasets\n", + " Building wheel for datasets (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + " Created wheel for datasets: filename=datasets-2.1.1.dev0-py3-none-any.whl size=327456 sha256=d7cadb2d89edc35e658adda0904ba4c5a7a20d39e04a3060fbdc3888fc78f67f\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-yre84zb1/wheels/b7/b2/b6/a0b4e0d11cb66d705e54f7bb72fdbe910b5e9f198ada8b4347\n", + "Successfully built datasets\n", + "Installing collected packages: datasets\n", + " Attempting uninstall: datasets\n", + " Found existing installation: datasets 2.1.0\n", + " Uninstalling datasets-2.1.0:\n", + " Successfully uninstalled datasets-2.1.0\n", + "Successfully installed datasets-2.1.1.dev0\n" + ] } + ], + "source": [ + "# We won't need TensorFlow here\n", + "!pip uninstall -y tensorflow\n", + "# Install `transformers` from master\n", + "!pip install git+https://github.com/huggingface/transformers\n", + "!pip list | grep -E 'transformers|tokenizers'\n", + "!pip install nlp==0.2.0\n", + "!pip install datasets\n", + "!pip install git+https://github.com/huggingface/nlp\n", + "\n", + "# transformers version at notebook update --- 2.11.0\n", + "# tokenizers version at notebook update --- 0.8.0rc1" + ] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "Install the correct dependencies on HuggingFace transformer and ternsorflow" - ], - "metadata": { - "id": "xIT_uHUdThub" - } - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "f3KWTckbTUs2", - "outputId": "2012bedd-8fb1-4909-d4bc-1f0ce0ad416e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Found existing installation: tensorflow 2.8.0\n", - "Uninstalling tensorflow-2.8.0:\n", - " Successfully uninstalled tensorflow-2.8.0\n", - "Collecting git+https://github.com/huggingface/transformers\n", - " Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-nl4u3bjj\n", - " Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-nl4u3bjj\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2.23.0)\n", - "Collecting huggingface-hub<1.0,>=0.1.0\n", - " Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)\n", - "\u001b[K |████████████████████████████████| 77 kB 4.2 MB/s \n", - "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (2019.12.20)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.11.3)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (21.3)\n", - "Collecting pyyaml>=5.1\n", - " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", - "\u001b[K |████████████████████████████████| 596 kB 16.8 MB/s \n", - "\u001b[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1\n", - " Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)\n", - "\u001b[K |████████████████████████████████| 6.6 MB 42.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (3.6.0)\n", - "Collecting sacremoses\n", - " Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)\n", - "\u001b[K |████████████████████████████████| 895 kB 44.5 MB/s \n", - "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (4.64.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers==4.19.0.dev0) (1.21.6)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.1.0->transformers==4.19.0.dev0) (4.2.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers==4.19.0.dev0) (3.0.8)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers==4.19.0.dev0) (3.8.0)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers==4.19.0.dev0) (3.0.4)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.1.0)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (7.1.2)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers==4.19.0.dev0) (1.15.0)\n", - "Building wheels for collected packages: transformers\n", - " Building wheel for transformers (PEP 517) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for transformers: filename=transformers-4.19.0.dev0-py3-none-any.whl size=4040857 sha256=84073fb7ad0bd8a06a08636e44e15aadf97eb4dd26c2d2a0e3dfc90232f8158b\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-ii8u1on2/wheels/35/2e/a7/d819e3310040329f0f47e57c9e3e7a7338aa5e74c49acfe522\n", - "Successfully built transformers\n", - "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n", - " Attempting uninstall: pyyaml\n", - " Found existing installation: PyYAML 3.13\n", - " Uninstalling PyYAML-3.13:\n", - " Successfully uninstalled PyYAML-3.13\n", - "Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.49 tokenizers-0.12.1 transformers-4.19.0.dev0\n", - "tokenizers 0.12.1\n", - "transformers 4.19.0.dev0\n", - "Collecting nlp==0.2.0\n", - " Downloading nlp-0.2.0-py3-none-any.whl (857 kB)\n", - "\u001b[K |████████████████████████████████| 857 kB 8.9 MB/s \n", - "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (3.6.0)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (2.23.0)\n", - "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (6.0.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (1.21.6)\n", - "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (4.64.0)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from nlp==0.2.0) (0.3.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2021.10.8)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->nlp==0.2.0) (2.10)\n", - "Installing collected packages: nlp\n", - "Successfully installed nlp-0.2.0\n", - "Collecting datasets\n", - " Downloading datasets-2.1.0-py3-none-any.whl (325 kB)\n", - "\u001b[K |████████████████████████████████| 325 kB 8.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.21.6)\n", - "Collecting fsspec[http]>=2021.05.0\n", - " Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)\n", - "\u001b[K |████████████████████████████████| 136 kB 60.7 MB/s \n", - "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n", - "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.11.3)\n", - "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (0.5.1)\n", 
- "Collecting xxhash\n", - " Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", - "\u001b[K |████████████████████████████████| 212 kB 49.9 MB/s \n", - "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.3.5)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.64.0)\n", - "Collecting aiohttp\n", - " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", - "\u001b[K |████████████████████████████████| 1.1 MB 48.8 MB/s \n", - "\u001b[?25hRequirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (6.0.1)\n", - "Collecting responses<0.19\n", - " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.2.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.6.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n", - "Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1\n", - " Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)\n", - "\u001b[K |████████████████████████████████| 127 kB 59.6 MB/s \n", - "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.12)\n", - "Collecting asynctest==0.13.0\n", - " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n", - "Collecting multidict<7.0,>=4.5\n", - " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", - "\u001b[K |████████████████████████████████| 94 kB 3.7 MB/s \n", - "\u001b[?25hCollecting yarl<2.0,>=1.0\n", - " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", - "\u001b[K |████████████████████████████████| 271 kB 57.7 MB/s \n", - "\u001b[?25hCollecting async-timeout<5.0,>=4.0.0a3\n", - " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n", - "Collecting frozenlist>=1.1.1\n", - " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", - "\u001b[K |████████████████████████████████| 144 kB 62.5 MB/s \n", - "\u001b[?25hCollecting aiosignal>=1.1.2\n", - " Downloading 
aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.8.0)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2022.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", - "Installing collected packages: multidict, frozenlist, yarl, urllib3, asynctest, async-timeout, aiosignal, fsspec, aiohttp, xxhash, responses, datasets\n", - " Attempting uninstall: urllib3\n", - " Found existing installation: urllib3 1.24.3\n", - " Uninstalling urllib3-1.24.3:\n", - " Successfully uninstalled urllib3-1.24.3\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "kapre 0.3.7 requires tensorflow>=2.0.0, which is not installed.\n", - "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001b[0m\n", - "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-2.1.0 frozenlist-1.3.0 fsspec-2022.3.0 multidict-6.0.2 responses-0.18.0 urllib3-1.25.11 xxhash-3.0.0 yarl-1.7.2\n", - "Collecting git+https://github.com/huggingface/nlp\n", - " Cloning https://github.com/huggingface/nlp to /tmp/pip-req-build-zp5gwnk_\n", - " Running command git clone -q https://github.com/huggingface/nlp /tmp/pip-req-build-zp5gwnk_\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.21.6)\n", - "Requirement already satisfied: pyarrow>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (6.0.1)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.3.4)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (1.3.5)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2.23.0)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (4.64.0)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.0.0)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.70.12.2)\n", - "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (2022.3.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (3.8.1)\n", - "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.5.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (21.3)\n", - "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.7/dist-packages (from datasets==2.1.1.dev0) (0.18.0)\n", - "Requirement already satisfied: importlib_metadata in /usr/local/lib/python3.7/dist-packages (from 
datasets==2.1.1.dev0) (4.11.3)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (3.6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (4.2.0)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets==2.1.1.dev0) (6.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets==2.1.1.dev0) (3.0.8)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2.10)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (2021.10.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (1.25.11)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets==2.1.1.dev0) (3.0.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (4.0.2)\n", - "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (2.0.12)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.3.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.2.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (21.4.0)\n", - "Requirement already satisfied: asynctest==0.13.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (0.13.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (1.7.2)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets==2.1.1.dev0) (6.0.2)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib_metadata->datasets==2.1.1.dev0) (3.8.0)\n", - "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2.8.2)\n", - "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets==2.1.1.dev0) (2022.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets==2.1.1.dev0) (1.15.0)\n", - "Building wheels for collected packages: datasets\n", - " Building wheel for datasets (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for datasets: filename=datasets-2.1.1.dev0-py3-none-any.whl size=327456 sha256=d7cadb2d89edc35e658adda0904ba4c5a7a20d39e04a3060fbdc3888fc78f67f\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-yre84zb1/wheels/b7/b2/b6/a0b4e0d11cb66d705e54f7bb72fdbe910b5e9f198ada8b4347\n", - "Successfully built datasets\n", - "Installing collected packages: datasets\n", - " Attempting uninstall: datasets\n", - " Found existing installation: datasets 2.1.0\n", - " Uninstalling datasets-2.1.0:\n", - " Successfully uninstalled datasets-2.1.0\n", - "Successfully installed datasets-2.1.1.dev0\n" - ] - } - ], - "source": [ - "# We won't need TensorFlow here\n", - "!pip uninstall -y tensorflow\n", - "# Install `transformers` from master\n", - "!pip install git+https://github.com/huggingface/transformers\n", - "!pip list | grep -E 'transformers|tokenizers'\n", - "!pip install nlp==0.2.0\n", - "!pip install datasets\n", - "!pip install git+https://github.com/huggingface/nlp\n", - "\n", - "# transformers version at notebook update --- 2.11.0\n", - "# tokenizers version at notebook update --- 0.8.0rc1" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Fetch datasets" - ], - "metadata": { - "id": "b4Jowf-vf8Ck" - } - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "import tokenize\n", - "import dis\n", - "import sys\n", - "import re\n", - "import keyword\n", - "import pandas as pd\n", - "import ast\n", - "import torch\n", - "import signal\n", - "from functools import wraps\n", - "\n", - "def multireplace(string, replacements, ignore_case=False):\n", - " \"\"\"\n", - " Given a string and a replacement map, it returns the replaced string.\n", - " :param str string: string to execute replacements on\n", - " :param dict replacements: replacement dictionary {value to find: value to replace}\n", - " :param bool ignore_case: whether the match should be case insensitive\n", - " :rtype: str\n", - " \"\"\"\n", - " # If case insensitive, we need to normalize the old string so that later a replacement\n", - " # can be found. 
For instance with {\"HEY\": \"lol\"} we should match and find a replacement for \"hey\",\n", - " # \"HEY\", \"hEy\", etc.\n", - " if ignore_case:\n", - " def normalize_old(s):\n", - " return s.lower()\n", - " re_mode = re.IGNORECASE\n", - " else:\n", - " def normalize_old(s):\n", - " return s\n", - " re_mode = 0\n", - "\n", - " replacements = {normalize_old(key): val for key, val in replacements.items()}\n", - " \n", - " # Place longer ones first to keep shorter substrings from matching where the longer ones should take place\n", - " # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce\n", - " # 'hey ABC' and not 'hey ABc'\n", - " rep_sorted = sorted(replacements, key=len, reverse=True)\n", - " rep_escaped = map(re.escape, rep_sorted)\n", - " \n", - " # Create a big OR regex that matches any of the substrings to replace\n", - " pattern = re.compile(\"|\".join(rep_escaped), re_mode)\n", - " \n", - " # For each match, look up the new string in the replacements, being the key the normalized old string\n", - " return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)\n", - "\n", - "\n", - "def convert(file, output_file):\n", - " with open (file, \"r\") as f:\n", - " text = f.read() \n", - "\n", - " replacements = {}\n", - " for node in ast.iter_child_nodes(ast.parse(text)):\n", - " if isinstance(node, ast.ImportFrom):\n", - " replacements.update({node.module: 'MODULE'})\n", - " if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):\n", - " for i, v in enumerate(node.names):\n", - " if(node.names[i].asname):\n", - " replacements.update({node.names[i].name: 'LIB'}) \n", - " replacements.update({node.names[i].asname: 'ALIAS'})\n", - " else:\n", - " replacements.update({node.names[i].name: 'LIBRARY'})\n", - "\n", - "\n", - " # reomve * from the dictionary (handle from module import * statement)\n", - " replacements.pop('*', None)\n", - " print('List of modules and libraries to replace:\\n', replacements)\n", - "\n", - " with open('med.py','w') as f:\n", - " f.write(multireplace(text, replacements, ignore_case = True))\n", - "\n", - " file = 'med.py'\n", - " with open(file,'rb') as f:\n", - " tokens = list(tokenize.tokenize(f.readline))\n", - " \n", - " ### extract important data from the output of tokenize package\n", - " toks = pd.DataFrame(columns = ['original','type','text', 'line','pos'])\n", - "\n", - " last_line = 0\n", - " last_pos = 0\n", - "\n", - " for token in tokens:\n", - " \n", - " tok_org = token.string\n", - " tok_text = token.string \n", - " tok_type = str(token).split('(')[2].split(')')[0]\n", - "\n", - " # convert keywords to upper\n", - " if keyword.iskeyword(tok_text):\n", - " tok_type = str.upper(tok_text)\n", - " \n", - " #extract operations\n", - " # if tok_type == 'OP':\n", - " # tok_type = tok_text\n", - "\n", - "\n", - " # getting rid of comments and empty lines\n", - " if tok_type in ['NL','NEWLINE','COMMENT']:\n", - " continue\n", - " \n", - " #retrieve the position\n", - " tok_line = token.start[0]\n", - " \n", - " if last_line == tok_line:\n", - " last_pos += 1\n", - " else:\n", - " last_pos = 1\n", - " tok_pos = last_pos\n", - " last_line = tok_line\n", - " \n", - " toks = toks.append({'type':tok_type,\n", - " 'original':tok_org,\n", - " 'text':tok_text,\n", - " 'line':tok_line,\n", - " 'pos':tok_pos},ignore_index=True)\n", - "\n", - "\n", - " # remove encoding lines and end of file\n", - " toks.line = toks.line.astype('int')\n", - " toks.pos = 
toks.pos.astype('int')\n", - " toks = toks.loc[~((toks.type == 'ENCODING') | (toks.type == 'ENDMARKER'))]\n", - " toks['doc'] = (toks.text.str.contains('\"\"\"') | toks.text.str.contains(\"'''\"))\n", - " toks = toks.loc[~(toks.doc)].drop(['doc'],axis=1)\n", - "\n", - " toks.head(20)\n", - "\n", - " indent = 0\n", - " last_line = 0\n", - "\n", - " for index,row in toks.iterrows():\n", - " if row.type == \"INDENT\":\n", - " indent +=1\n", - " continue\n", - " if row.type == \"DEDENT\":\n", - " indent -=1\n", - " continue\n", - " if row.line != last_line:\n", - " last_line = row.line\n", - " toks = toks.append({'type':'\\n'+indent*'\\t',\n", - " 'text':'\\n'+indent*'\\t',\n", - " 'line':row.line,\n", - " 'pos':row.pos-1},ignore_index=True)\n", - "\n", - " toks = toks.loc[~((toks.type=='INDENT') | (toks.type=='DEDENT'))]\n", - " toks = toks.sort_values(['line','pos']).reset_index(drop=True)\n", - "\n", - "\n", - " # drop the first row (empty line)\n", - " toks.drop(toks.index[:1], inplace=True)\n", - "\n", - " toks.head(20)\n", - "\n", - " with open(file,'r') as f:\n", - " src = f.read()\n", - "\n", - " stdout_backup = sys.stdout\n", - " sys.stdout = open('dis.txt','w')\n", - " dis.dis(src)\n", - " sys.stdout = stdout_backup\n", - "\n", - " with open('dis.txt','r') as f:\n", - " lines = f.readlines()\n", - "\n", - " # find global variables\n", - " glbls = [].copy() \n", - " for l in lines:\n", - " clean = l.replace('>>',' ').strip().split()\n", - " if len(clean):\n", - " try:\n", - " int(clean[1])\n", - " line = int(clean[0])\n", - " except:\n", - " clean = [str(line)]+clean\n", - " if 'LOAD_GLOBAL' in clean:\n", - " print('found a global!')\n", - " glbls.append((int(clean[0]),clean[-1].replace('(','').replace(')','')))\n", - "\n", - " for l,n in glbls:\n", - " toks.loc[(toks.line==l) & (toks.text==n),'type'] = 'GLOBAL_VARIABLE'\n", - "\n", - " toks .head(10) \n", - "\n", - " text_imports = ' '.join(list(toks.text)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . ','.').replace(' (','(')\n", - " text_imports = multireplace(text_imports, replacements, ignore_case = True)\n", - "\n", - " with open('normalized_textual_file.py','w') as f:\n", - " f.write(text_imports)\n", - "\n", - " toks.type = toks.apply(lambda x: x['text'] if str(x['text']) in ['LIBRARY','LIB','ALIAS','MODULE'] else x['type'], axis = 1)\n", - " code_converted = ' '.join(list(toks.type)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . 
','.').replace(' (','(')\n", - "\n", - " final_replacements = {'GLOBAL_VARIABLE(':'FUNCTION_CALL(', \n", - " # 'NAME.NAME':'NAME',\n", - " 'NAME(':'FUNCTION_CALL(',\n", - " 'NAME':'LOCAL_VARIABLE'}\n", - "\n", - " code_converted = multireplace(code_converted, final_replacements, ignore_case = False)\n", - "\n", - " with open(output_file,'w') as f:\n", - " f.write(code_converted)\n", - "\n", - "\n", - "WEIGHT_MATRIX = {\n", - " 'NUMBER' : [1.625, 1.25, 1.125],\n", - " 'NAME' : [1.625, 1.125, 1.5],\n", - " 'LOCAL_VARIABLE' : [1.625, 1.125, 1.5],\n", - " 'FUNCTION_NAME' : [1.625, 1.25, 1.5]\n", - " }\n", - "\n", - "input_file = \"/tmp/input_file.txt\"\n", - "output_file = \"/tmp/output_file.txt\"\n", - "def reranking_layer(outputs, context, tokenizer):\n", - "\n", - " with open(input_file, 'w') as f:\n", - " f.write(context);\n", - " \n", - " convert(file_path=input_file, output_file=output_file)\n", - " with open(output_file, 'rb') as context:\n", - " inputs = list(zip(tokenizer(input_file), tokenizer(output_file)))\n", - " for item in inputs:\n", - " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(WEIGHT_MATRIX[item[1]]))\n", - "\n" - ], - "metadata": { - "id": "PGHNJPreFmOw" - }, - "execution_count": 13, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "convert(\"./sample_data/data/peakfinder.py\", \"./converted_train.txt\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cNqN20xrhkGq", - "outputId": "7e041a8c-eb0a-4648-9de0-3c1ffc6440e6" - }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "List of modules and libraries to replace:\n", - " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# pretrain dataset\n", - "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip\n", - "#!unzip 'pretrain_dataset.zip'\n", - "\n", - "# converted dataset\n", - "#! wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/converted_dataset.zip\n", - "#! unzip 'converted_dataset.zip'\n", - "\n", - "# test dataset\n", - "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/finetune_eval_dataset.zip\n", - "#!unzip 'finetune_eval_dataset.zip'" - ], - "metadata": { - "id": "sEmqUukXf__k" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Train a customised python byte-level Byte-pair encoding tokenizer. 
" - ], - "metadata": { - "id": "M0wmpgCxUIF3" - } + { + "cell_type": "markdown", + "source": [ + "Fetch datasets" + ], + "metadata": { + "id": "b4Jowf-vf8Ck" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "import tokenize\n", + "import dis\n", + "import sys\n", + "import re\n", + "import keyword\n", + "import pandas as pd\n", + "import ast\n", + "import torch\n", + "import signal\n", + "from functools import wraps\n", + "\n", + "def multireplace(string, replacements, ignore_case=False):\n", + " \"\"\"\n", + " Given a string and a replacement map, it returns the replaced string.\n", + " :param str string: string to execute replacements on\n", + " :param dict replacements: replacement dictionary {value to find: value to replace}\n", + " :param bool ignore_case: whether the match should be case insensitive\n", + " :rtype: str\n", + " \"\"\"\n", + " # If case insensitive, we need to normalize the old string so that later a replacement\n", + " # can be found. For instance with {\"HEY\": \"lol\"} we should match and find a replacement for \"hey\",\n", + " # \"HEY\", \"hEy\", etc.\n", + " if ignore_case:\n", + " def normalize_old(s):\n", + " return s.lower()\n", + " re_mode = re.IGNORECASE\n", + " else:\n", + " def normalize_old(s):\n", + " return s\n", + " re_mode = 0\n", + "\n", + " replacements = {normalize_old(key): val for key, val in replacements.items()}\n", + " \n", + " # Place longer ones first to keep shorter substrings from matching where the longer ones should take place\n", + " # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce\n", + " # 'hey ABC' and not 'hey ABc'\n", + " rep_sorted = sorted(replacements, key=len, reverse=True)\n", + " rep_escaped = map(re.escape, rep_sorted)\n", + " \n", + " # Create a big OR regex that matches any of the substrings to replace\n", + " pattern = re.compile(\"|\".join(rep_escaped), re_mode)\n", + " \n", + " # For each match, look up the new string in the replacements, being the key the normalized old string\n", + " return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)\n", + "\n", + "\n", + "def convert(file, output_file):\n", + " with open (file, \"r\") as f:\n", + " text = f.read() \n", + "\n", + " replacements = {}\n", + " for node in ast.iter_child_nodes(ast.parse(text)):\n", + " if isinstance(node, ast.ImportFrom):\n", + " replacements.update({node.module: 'MODULE'})\n", + " if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):\n", + " for i, v in enumerate(node.names):\n", + " if(node.names[i].asname):\n", + " replacements.update({node.names[i].name: 'LIB'}) \n", + " replacements.update({node.names[i].asname: 'ALIAS'})\n", + " else:\n", + " replacements.update({node.names[i].name: 'LIBRARY'})\n", + "\n", + "\n", + " # reomve * from the dictionary (handle from module import * statement)\n", + " replacements.pop('*', None)\n", + " print('List of modules and libraries to replace:\\n', replacements)\n", + "\n", + " with open('med.py','w') as f:\n", + " f.write(multireplace(text, replacements, ignore_case = True))\n", + "\n", + " file = 'med.py'\n", + " with open(file,'rb') as f:\n", + " tokens = list(tokenize.tokenize(f.readline))\n", + " \n", + " ### extract important data from the output of tokenize package\n", + " toks = pd.DataFrame(columns = ['original','type','text', 'line','pos'])\n", + "\n", + " last_line = 0\n", + " last_pos = 0\n", + "\n", + " for token in tokens:\n", + " \n", + " tok_org = token.string\n", + " 
tok_text = token.string \n", + " tok_type = str(token).split('(')[2].split(')')[0]\n", + "\n", + " # convert keywords to upper\n", + " if keyword.iskeyword(tok_text):\n", + " tok_type = str.upper(tok_text)\n", + " \n", + " #extract operations\n", + " # if tok_type == 'OP':\n", + " # tok_type = tok_text\n", + "\n", + "\n", + " # getting rid of comments and empty lines\n", + " if tok_type in ['NL','NEWLINE','COMMENT']:\n", + " continue\n", + " \n", + " #retrieve the position\n", + " tok_line = token.start[0]\n", + " \n", + " if last_line == tok_line:\n", + " last_pos += 1\n", + " else:\n", + " last_pos = 1\n", + " tok_pos = last_pos\n", + " last_line = tok_line\n", + " \n", + " toks = toks.append({'type':tok_type,\n", + " 'original':tok_org,\n", + " 'text':tok_text,\n", + " 'line':tok_line,\n", + " 'pos':tok_pos},ignore_index=True)\n", + "\n", + "\n", + " # remove encoding lines and end of file\n", + " toks.line = toks.line.astype('int')\n", + " toks.pos = toks.pos.astype('int')\n", + " toks = toks.loc[~((toks.type == 'ENCODING') | (toks.type == 'ENDMARKER'))]\n", + " toks['doc'] = (toks.text.str.contains('\"\"\"') | toks.text.str.contains(\"'''\"))\n", + " toks = toks.loc[~(toks.doc)].drop(['doc'],axis=1)\n", + "\n", + " toks.head(20)\n", + "\n", + " indent = 0\n", + " last_line = 0\n", + "\n", + " for index,row in toks.iterrows():\n", + " if row.type == \"INDENT\":\n", + " indent +=1\n", + " continue\n", + " if row.type == \"DEDENT\":\n", + " indent -=1\n", + " continue\n", + " if row.line != last_line:\n", + " last_line = row.line\n", + " toks = toks.append({'type':'\\n'+indent*'\\t',\n", + " 'text':'\\n'+indent*'\\t',\n", + " 'line':row.line,\n", + " 'pos':row.pos-1},ignore_index=True)\n", + "\n", + " toks = toks.loc[~((toks.type=='INDENT') | (toks.type=='DEDENT'))]\n", + " toks = toks.sort_values(['line','pos']).reset_index(drop=True)\n", + "\n", + "\n", + " # drop the first row (empty line)\n", + " toks.drop(toks.index[:1], inplace=True)\n", + "\n", + " toks.head(20)\n", + "\n", + " with open(file,'r') as f:\n", + " src = f.read()\n", + "\n", + " stdout_backup = sys.stdout\n", + " sys.stdout = open('dis.txt','w')\n", + " dis.dis(src)\n", + " sys.stdout = stdout_backup\n", + "\n", + " with open('dis.txt','r') as f:\n", + " lines = f.readlines()\n", + "\n", + " # find global variables\n", + " glbls = [].copy() \n", + " for l in lines:\n", + " clean = l.replace('>>',' ').strip().split()\n", + " if len(clean):\n", + " try:\n", + " int(clean[1])\n", + " line = int(clean[0])\n", + " except:\n", + " clean = [str(line)]+clean\n", + " if 'LOAD_GLOBAL' in clean:\n", + " print('found a global!')\n", + " glbls.append((int(clean[0]),clean[-1].replace('(','').replace(')','')))\n", + "\n", + " for l,n in glbls:\n", + " toks.loc[(toks.line==l) & (toks.text==n),'type'] = 'GLOBAL_VARIABLE'\n", + "\n", + " toks .head(10) \n", + "\n", + " text_imports = ' '.join(list(toks.text)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . ','.').replace(' (','(')\n", + " text_imports = multireplace(text_imports, replacements, ignore_case = True)\n", + "\n", + " with open('normalized_textual_file.py','w') as f:\n", + " f.write(text_imports)\n", + "\n", + " toks.type = toks.apply(lambda x: x['text'] if str(x['text']) in ['LIBRARY','LIB','ALIAS','MODULE'] else x['type'], axis = 1)\n", + " code_converted = ' '.join(list(toks.type)).replace('\\n ','\\n').replace(' \\n','\\n').replace('\\t ','\\t').replace(' . 
','.').replace(' (','(')\n", + "\n", + " final_replacements = {'GLOBAL_VARIABLE(':'FUNCTION_CALL(', \n", + " # 'NAME.NAME':'NAME',\n", + " 'NAME(':'FUNCTION_CALL(',\n", + " 'NAME':'LOCAL_VARIABLE'}\n", + "\n", + " code_converted = multireplace(code_converted, final_replacements, ignore_case = False)\n", + "\n", + " with open(output_file,'w') as f:\n", + " f.write(code_converted)\n", + "\n", + "\n", + "WEIGHT_MATRIX = {\n", + " 'NUMBER' : [1.625, 1.25, 1.125],\n", + " 'NAME' : [1.625, 1.125, 1.5],\n", + " 'LOCAL_VARIABLE' : [1.625, 1.125, 1.5],\n", + " 'FUNCTION_NAME' : [1.625, 1.25, 1.5]\n", + " }\n", + "\n", + "input_file = \"/tmp/input_file.txt\"\n", + "output_file = \"/tmp/output_file.txt\"\n", + "def reranking_layer(outputs, context, tokenizer):\n", + "\n", + " with open(input_file, 'w') as f:\n", + " f.write(context);\n", + " \n", + " convert(file_path=input_file, output_file=output_file)\n", + " with open(output_file, 'rb') as context:\n", + " inputs = list(zip(tokenizer(input_file), tokenizer(output_file)))\n", + " for item in inputs:\n", + " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(WEIGHT_MATRIX[item[1]]))\n", + "\n" + ], + "metadata": { + "id": "PGHNJPreFmOw" + }, + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "convert(\"./sample_data/data/peakfinder.py\", \"./converted_train.txt\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cNqN20xrhkGq", + "outputId": "7e041a8c-eb0a-4648-9de0-3c1ffc6440e6" + }, + "execution_count": 14, + "outputs": [ { - "cell_type": "code", - "source": [ - "from pathlib import Path\n", - "from transformers import AutoTokenizer,TextDataset,DataCollatorForLanguageModeling\n", - "import glob\n", - "import random \n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n" - ], - "metadata": { - "id": "oPq1Bau8UbpB" - }, - "execution_count": 5, - "outputs": [] + "output_type": "stream", + "name": "stdout", + "text": [ + "List of modules and libraries to replace:\n", + " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# pretrain dataset\n", + "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip\n", + "#!unzip 'pretrain_dataset.zip'\n", + "\n", + "# converted dataset\n", + "#! wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/converted_dataset.zip\n", + "#! unzip 'converted_dataset.zip'\n", + "\n", + "# test dataset\n", + "#!wget https://huggingface.co/rgismondi/python-50k-dedup/blob/main/finetune_eval_dataset.zip\n", + "#!unzip 'finetune_eval_dataset.zip'" + ], + "metadata": { + "id": "sEmqUukXf__k" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Train a customised python byte-level Byte-pair encoding tokenizer. 
" + ], + "metadata": { + "id": "M0wmpgCxUIF3" + } + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from transformers import AutoTokenizer,TextDataset,DataCollatorForLanguageModeling\n", + "import glob\n", + "import random \n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n" + ], + "metadata": { + "id": "oPq1Bau8UbpB" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "paths = [str(x) for x in Path(\".\").glob(\"./sample_data/data/*.py\")]\n", + "converted_paths = []\n", + "for path in paths:\n", + " converted_path = \"./sample_data/converted/\"+ path.split(\"/\").pop().split(\".\")[0] + \".txt\"\n", + " print(converted_path)\n", + " try:\n", + " convert(path, converted_path)\n", + " converted_paths.append(converted_path)\n", + " except:\n", + " pass\n", + "\n", + " \n", + "with open(\"./train.txt\", \"wb\") as train_outfile:\n", + " with open(\"./test.txt\", \"wb\") as test_outfile:\n", + " for f in paths:\n", + " choice = random.random()\n", + " with open(f, \"rb\") as infile:\n", + " if choice > 0.1:\n", + " train_outfile.write(infile.read())\n", + " else:\n", + " test_outfile.write(infile.read())\n", + "\n", + "with open(\"./converted_train.txt\", \"wb\") as train_outfile:\n", + " with open(\"./converted_test.txt\", \"wb\") as test_outfile:\n", + " for f in converted_paths:\n", + " choice = random.random()\n", + " with open(f, \"rb\") as infile:\n", + " if choice > 0.1:\n", + " train_outfile.write(infile.read())\n", + " else:\n", + " test_outfile.write(infile.read())\n" + ], + "metadata": { + "id": "vOXkK5bWYNXz", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "768058fc-877f-4d58-bea9-59933c15e9ad" + }, + "execution_count": 26, + "outputs": [ { - "cell_type": "code", - "source": [ - "paths = [str(x) for x in Path(\".\").glob(\"./sample_data/data/*.py\")]\n", - "converted_paths = []\n", - "for path in paths:\n", - " converted_path = \"./sample_data/converted/\"+ path.split(\"/\").pop().split(\".\")[0] + \".txt\"\n", - " print(converted_path)\n", - " try:\n", - " convert(path, converted_path)\n", - " converted_paths.append(converted_path)\n", - " except:\n", - " pass\n", - "\n", - " \n", - "with open(\"./train.txt\", \"wb\") as train_outfile:\n", - " with open(\"./test.txt\", \"wb\") as test_outfile:\n", - " for f in paths:\n", - " choice = random.random()\n", - " with open(f, \"rb\") as infile:\n", - " if choice > 0.1:\n", - " train_outfile.write(infile.read())\n", - " else:\n", - " test_outfile.write(infile.read())\n", - "\n", - "with open(\"./converted_train.txt\", \"wb\") as train_outfile:\n", - " with open(\"./converted_test.txt\", \"wb\") as test_outfile:\n", - " for f in converted_paths:\n", - " choice = random.random()\n", - " with open(f, \"rb\") as infile:\n", - " if choice > 0.1:\n", - " train_outfile.write(infile.read())\n", - " else:\n", - " test_outfile.write(infile.read())\n" - ], - "metadata": { - "id": "vOXkK5bWYNXz", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "768058fc-877f-4d58-bea9-59933c15e9ad" - }, - "execution_count": 26, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "./sample_data/converted/peakfinder.txt\n", - "List of modules and libraries to replace:\n", - " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - 
"found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n", - "found a global!\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "./sample_data/converted/peakfinder.txt\n", + "List of modules and libraries to replace:\n", + " {'pylab': 'MODULE', 'rtlsdr': 'MODULE'}\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n", + "found a global!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def load_dataset(train_path,test_path,tokenizer):\n", + " train_dataset = TextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=train_path,\n", + " block_size=128)\n", + " \n", + " test_dataset = TextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=test_path,\n", + " block_size=128) \n", + " \n", + " data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=False,\n", + " )\n", + " return train_dataset,test_dataset,data_collator\n", + "\n", + "train_dataset,test_dataset,data_collator = load_dataset(\"./train.txt\", \"./test.txt\",tokenizer)\n", + "converted_train_dataset, converted_test_dataset, converted_datacollator = load_dataset(\"./converted_train.txt\", \"./converted_test.txt\",tokenizer)\n", + "#pretrain_raw_files = glob.glob(\"./pretrain_dataset\" + '/**/*.py', recursive=True)\n", + "#pretrain_converted_files = glob.glob(\"./pretrain_converted_dataset\" + '/**/*.py', recursive=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "5Q4kLrTZXTjZ", + "outputId": "15e23b41-9245-454f-d9d6-c169eae0f943" + }, + "execution_count": 16, + "outputs": [ { - "cell_type": "code", - "source": [ - "def load_dataset(train_path,test_path,tokenizer):\n", - " train_dataset = TextDataset(\n", - " tokenizer=tokenizer,\n", - " file_path=train_path,\n", - " block_size=128)\n", - " \n", - " test_dataset = TextDataset(\n", - " tokenizer=tokenizer,\n", - " file_path=test_path,\n", - " block_size=128) \n", - " \n", - " data_collator = DataCollatorForLanguageModeling(\n", - " tokenizer=tokenizer, mlm=False,\n", - " )\n", - " return train_dataset,test_dataset,data_collator\n", - "\n", - "train_dataset,test_dataset,data_collator = load_dataset(\"./train.txt\", \"./test.txt\",tokenizer)\n", - "converted_train_dataset, converted_test_dataset, converted_datacollator = load_dataset(\"./converted_train.txt\", \"./converted_test.txt\",tokenizer)\n", - "#pretrain_raw_files = glob.glob(\"./pretrain_dataset\" + '/**/*.py', recursive=True)\n", - "#pretrain_converted_files = glob.glob(\"./pretrain_converted_dataset\" + '/**/*.py', recursive=True)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5Q4kLrTZXTjZ", - "outputId": "15e23b41-9245-454f-d9d6-c169eae0f943" - }, - "execution_count": 16, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. 
You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", - " FutureWarning,\n", - "Token indices sequence length is longer than the specified maximum sequence length for this model (1381 > 1024). Running this sequence through the model will result in indexing errors\n" - ] - } - ] + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:58: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", + " FutureWarning,\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (1381 > 1024). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "tokenizer(\"for i in range(10)\")[\"input_ids\"]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "0AraoltupXmb", + "outputId": "c90124e7-3695-44af-827d-c355632ec2af" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "tokenizer(\"for i in range(10)\")[\"input_ids\"]" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "0AraoltupXmb", - "outputId": "c90124e7-3695-44af-827d-c355632ec2af" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[1640, 1312, 287, 2837, 7, 940, 8]" - ] - }, - "metadata": {}, - "execution_count": 10 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "[1640, 1312, 287, 2837, 7, 940, 8]" ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import transformers\n", + "import nlp\n", + "import logging\n", + "from datasets import load_dataset\n", + "from transformers import TextDataset,DataCollatorForLanguageModeling\n", + "\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "dataset_dict = {\n", + " \"token\": train_dataset,\n", + " \"token_type\": train_dataset,\n", + " \"line\": train_dataset,\n", + "}\n", + "\n", + "print(dataset_dict[\"token\"])\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "4ePqOauaqjnZ", + "outputId": "b6959f0d-812a-4cf2-95e4-2718a5be58bd" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import transformers\n", - "import nlp\n", - "import logging\n", - "from datasets import load_dataset\n", - "from transformers import TextDataset,DataCollatorForLanguageModeling\n", - "\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "\n", - "dataset_dict = {\n", - " \"token\": train_dataset,\n", - " \"token_type\": train_dataset,\n", - " \"line\": train_dataset,\n", - "}\n", - "\n", - "print(dataset_dict[\"token\"])\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4ePqOauaqjnZ", - "outputId": "b6959f0d-812a-4cf2-95e4-2718a5be58bd" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - 
"\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers.utils.dummy_pt_objects import GPT2LMHeadModel\n", + "from transformers import GPT2Tokenizer\n", + "from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel\n", + "\n", + "\n", + "class MultitaskModel(transformers.PreTrainedModel):\n", + " def __init__(self, encoder, taskmodels_dict):\n", + " \"\"\"\n", + " Setting MultitaskModel up as a PretrainedModel allows us\n", + " to take better advantage of Trainer features\n", + " \"\"\"\n", + " super().__init__(transformers.PretrainedConfig())\n", + "\n", + " self.encoder = encoder\n", + " self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)\n", + "\n", + " def _get_models(self):\n", + " return self.taskmodels_dict\n", + "\n", + " @classmethod\n", + " def create(cls, model_name, model_type_dict, model_config_dict):\n", + " \"\"\"\n", + " This creates a MultitaskModel using the model class and config objects\n", + " from single-task models. \n", + "\n", + " We do this by creating each single-task model, and having them share\n", + " the same encoder transformer.\n", + " \"\"\"\n", + " shared_encoder = None\n", + " taskmodels_dict = {}\n", + " for task_name, model_type in model_type_dict.items():\n", + " model = model_type.from_pretrained( \"gpt2\",\n", + " config=model_config_dict[task_name],\n", + " )\n", + " if shared_encoder is None:\n", + " shared_encoder = cls.get_encoder(model)\n", + " else:\n", + " setattr(model, \"encoder\", shared_encoder)\n", + " taskmodels_dict[task_name] = model\n", + " return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)\n", + " \n", + "\n", + " @classmethod\n", + " def get_encoder(cls, model):\n", + " \"\"\"\n", + " The encoder transformer is named differently in each model \"architecture\".\n", + " This method lets us get the name of the encoder attribute\n", + " \"\"\"\n", + " model_class_name = model.__class__.__name__\n", + " if model_class_name.startswith(\"Roberta\"):\n", + " return \"roberta-base\"\n", + " elif model_class_name.startswith(\"GPT2\"):\n", + " config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) \n", + " encoder_decoder = EncoderDecoderModel(config=config)\n", + " return encoder_decoder.config.encoder\n", + " else:\n", + " raise KeyError(f\"Add support for new model {model_class_name}\")\n", + " \n", + " def forward(self, task_name, **kwargs):\n", + " return self.taskmodels_dict[task_name](**kwargs)" + ], + "metadata": { + "id": "jJLxsM_H4A6z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model_name = \"gpt2\"\n", + "multitask_model = MultitaskModel.create(\n", + " model_name=model_name,\n", + " model_type_dict={\n", + " \"token\": transformers.AutoModelWithLMHead,\n", + " \"token_type\": transformers.AutoModelWithLMHead,\n", + " \"line\": transformers.AutoModelForSequenceClassification,\n", + " },\n", + " model_config_dict={\n", + " \"token\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"token_type\": transformers.AutoConfig.from_pretrained(model_name),\n", + " \"line\": transformers.AutoConfig.from_pretrained(model_name),\n", + " },\n", + ")" + ], + "metadata": { + "id": "9bbwa7E74Q0x", + "colab": { + "base_uri": "https://localhost:8080/" }, + "outputId": "4aa35f34-7709-464a-8c6c-bc966a7f3be9" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "from 
transformers.utils.dummy_pt_objects import GPT2LMHeadModel\n", - "from transformers import GPT2Tokenizer\n", - "from transformers import GPT2Config, EncoderDecoderConfig, EncoderDecoderModel\n", - "\n", - "\n", - "class MultitaskModel(transformers.PreTrainedModel):\n", - " def __init__(self, encoder, taskmodels_dict):\n", - " \"\"\"\n", - " Setting MultitaskModel up as a PretrainedModel allows us\n", - " to take better advantage of Trainer features\n", - " \"\"\"\n", - " super().__init__(transformers.PretrainedConfig())\n", - "\n", - " self.encoder = encoder\n", - " self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)\n", - "\n", - " def _get_models(self):\n", - " return self.taskmodels_dict\n", - "\n", - " @classmethod\n", - " def create(cls, model_name, model_type_dict, model_config_dict):\n", - " \"\"\"\n", - " This creates a MultitaskModel using the model class and config objects\n", - " from single-task models. \n", - "\n", - " We do this by creating each single-task model, and having them share\n", - " the same encoder transformer.\n", - " \"\"\"\n", - " shared_encoder = None\n", - " taskmodels_dict = {}\n", - " for task_name, model_type in model_type_dict.items():\n", - " model = model_type.from_pretrained( \"gpt2\",\n", - " config=model_config_dict[task_name],\n", - " )\n", - " if shared_encoder is None:\n", - " shared_encoder = cls.get_encoder(model)\n", - " else:\n", - " setattr(model, \"encoder\", shared_encoder)\n", - " taskmodels_dict[task_name] = model\n", - " return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)\n", - " \n", - "\n", - " @classmethod\n", - " def get_encoder(cls, model):\n", - " \"\"\"\n", - " The encoder transformer is named differently in each model \"architecture\".\n", - " This method lets us get the name of the encoder attribute\n", - " \"\"\"\n", - " model_class_name = model.__class__.__name__\n", - " if model_class_name.startswith(\"Roberta\"):\n", - " return \"roberta-base\"\n", - " elif model_class_name.startswith(\"GPT2\"):\n", - " config = EncoderDecoderConfig.from_encoder_decoder_configs(model.config, model.config) \n", - " encoder_decoder = EncoderDecoderModel(config=config)\n", - " return encoder_decoder.config.encoder\n", - " else:\n", - " raise KeyError(f\"Add support for new model {model_class_name}\")\n", - " \n", - " def forward(self, task_name, **kwargs):\n", - " return self.taskmodels_dict[task_name](**kwargs)" - ], - "metadata": { - "id": "jJLxsM_H4A6z" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " 
\"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: 
FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n", + "\n", + "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Check that we have a GPU\n", + "!nvidia-smi\n", + "# Check that PyTorch sees it\n", + "import torch\n", + "torch.cuda.is_available()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "CNu7NKoYpvCP", + "outputId": "ae4d73f1-4657-4580-b85a-bfd1410bee7b" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "model_name = \"gpt2\"\n", - "multitask_model = MultitaskModel.create(\n", - " model_name=model_name,\n", - " model_type_dict={\n", - " \"token\": transformers.AutoModelWithLMHead,\n", - " \"token_type\": transformers.AutoModelWithLMHead,\n", - " \"line\": transformers.AutoModelForSequenceClassification,\n", - " },\n", - " model_config_dict={\n", - " \"token\": transformers.AutoConfig.from_pretrained(model_name),\n", - " \"token_type\": transformers.AutoConfig.from_pretrained(model_name),\n", - " \"line\": transformers.AutoConfig.from_pretrained(model_name),\n", - " },\n", - ")" - ], - "metadata": { - "id": "9bbwa7E74Q0x", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "4aa35f34-7709-464a-8c6c-bc966a7f3be9" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": 
[ - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " 
\"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", - " FutureWarning,\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n", - "\n", - "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. 
Make sure that the latest NVIDIA driver is installed and running.\n", + "\n" + ] }, { - "cell_type": "code", - "source": [ - "# Check that we have a GPU\n", - "!nvidia-smi\n", - "# Check that PyTorch sees it\n", - "import torch\n", - "torch.cuda.is_available()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CNu7NKoYpvCP", - "outputId": "ae4d73f1-4657-4580-b85a-bfd1410bee7b" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.\n", - "\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "False" - ] - }, - "metadata": {}, - "execution_count": 35 - } + "output_type": "execute_result", + "data": { + "text/plain": [ + "False" ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import dataclasses\n", + "from torch.utils.data.dataloader import DataLoader\n", + "from transformers.data.data_collator import DataCollatorForLanguageModeling, InputDataClass, DefaultDataCollator\n", + "from torch.utils.data.distributed import DistributedSampler\n", + "from torch.utils.data.sampler import RandomSampler\n", + "from typing import List, Union, Dict\n", + "from transformers import Trainer\n", + "from random import random\n", + "\n", + "\n", + "class NLPDataCollator(DataCollatorForLanguageModeling):\n", + " \"\"\"\n", + " Extending the existing DataCollator to work with NLP dataset batches\n", + " \"\"\"\n", + " def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:\n", + " first = features[0]\n", + " if isinstance(first, dict):\n", + " # NLP data sets current works presents features as lists of dictionary\n", + " # (one per example), so we will adapt the collate_batch logic for that\n", + " if \"labels\" in first and first[\"labels\"] is not None:\n", + " if first[\"labels\"].dtype == torch.int64:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.long)\n", + " else:\n", + " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.float)\n", + " batch = {\"labels\": labels}\n", + " for k, v in first.items():\n", + " if k != \"labels\" and v is not None and not isinstance(v, str):\n", + " batch[k] = torch.stack([f[k] for f in features])\n", + " return batch\n", + " else:\n", + " # otherwise, revert to using the default collate_batch\n", + " return DefaultDataCollator().collate_batch(features)\n", + "\n", + "\n", + "class StrIgnoreDevice(str):\n", + " \"\"\"\n", + " This is a hack. 
The Trainer is going call .to(device) on every input\n", + " value, but we need to pass in an additional `task_name` string.\n", + " This prevents it from throwing an error\n", + " \"\"\"\n", + " def to(self, device):\n", + " return self\n", + "\n", + "class DataLoaderWithTaskname:\n", + " \"\"\"\n", + " Wrapper around a DataLoader to also yield a task name\n", + " \"\"\"\n", + " def __init__(self, task_name, data_loader):\n", + " self.task_name = task_name\n", + " self.data_loader = data_loader\n", + "\n", + " self.batch_size = data_loader.batch_size\n", + " self.dataset = data_loader.dataset\n", + "\n", + " def __len__(self):\n", + " return len(self.data_loader)\n", + " \n", + " def __iter__(self):\n", + " for batch in self.data_loader:\n", + " batch[\"task_name\"] = StrIgnoreDevice(self.task_name)\n", + " yield batch\n", + "\n", + "\n", + "class MultitaskDataloader:\n", + " \"\"\"\n", + " Data loader that combines and samples from multiple single-task\n", + " data loaders.\n", + " \"\"\"\n", + " def __init__(self, dataloader_dict):\n", + " self.dataloader_dict = dataloader_dict\n", + " self.num_batches_dict = {\n", + " task_name: len(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " self.task_name_list = list(self.dataloader_dict)\n", + " self.dataset = [None] * sum(\n", + " len(dataloader.dataset) \n", + " for dataloader in self.dataloader_dict.values()\n", + " )\n", + "\n", + " def __len__(self):\n", + " return sum(self.num_batches_dict.values())\n", + "\n", + " def __iter__(self):\n", + " \"\"\"\n", + " For each batch, sample a task, and yield a batch from the respective\n", + " task Dataloader.\n", + "\n", + " We use size-proportional sampling, but you could easily modify this\n", + " to sample from some-other distribution.\n", + " \"\"\"\n", + " task_choice_list = []\n", + " for i, task_name in enumerate(self.task_name_list):\n", + " task_choice_list += [i] * self.num_batches_dict[task_name]\n", + " task_choice_list = np.array(task_choice_list)\n", + " np.random.shuffle(task_choice_list)\n", + " dataloader_iter_dict = {\n", + " task_name: iter(dataloader) \n", + " for task_name, dataloader in self.dataloader_dict.items()\n", + " }\n", + " for task_choice in task_choice_list:\n", + " task_name = self.task_name_list[task_choice]\n", + " yield next(dataloader_iter_dict[task_name]) \n", + "\n", + "class MultitaskTrainer(transformers.Trainer):\n", + "\n", + " def get_single_train_dataloader(self, task_name, train_dataset):\n", + " \"\"\"\n", + " Create a single-task data loader that also yields task names\n", + " \"\"\"\n", + " if self.train_dataset is None:\n", + " raise ValueError(\"Trainer: training requires a train_dataset.\")\n", + " \n", + " train_sampler = (\n", + " RandomSampler(train_dataset)\n", + " if self.args.local_rank == -1\n", + " else DistributedSampler(train_dataset)\n", + " )\n", + "\n", + " data_loader = DataLoaderWithTaskname(\n", + " task_name=task_name,\n", + " data_loader=DataLoader(\n", + " train_dataset,\n", + " batch_size=self.args.train_batch_size,\n", + " sampler=train_sampler\n", + " ),\n", + " )\n", + "\n", + " return data_loader\n", + "\n", + " def get_train_dataloader(self):\n", + " \"\"\"\n", + " Returns a MultitaskDataloader, which is not actually a Dataloader\n", + " but an iterable that returns a generator that samples from each \n", + " task Dataloader\n", + " \"\"\"\n", + " return MultitaskDataloader({\n", + " task_name: self.get_single_train_dataloader(task_name, task_dataset)\n", + " for task_name, 
task_dataset in self.train_dataset.items()\n", + " })\n", + " \n", + " def train(self):\n", + " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", + " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", + " self.trainer = Trainer(\n", + " model=model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " ),\n", + " data_collator=data_collator,\n", + " train_dataset=train_dataset,\n", + " )\n", + " self.trainer.train()\n", + "\n", + " def prediction_loop(self):\n", + " return self.trainer.predict()\n", + "\n", + " def compute_loss(self, model, inputs, return_outputs=True):\n", + " labels = inputs.get(\"labels\")\n", + " # forward pass\n", + " outputs = model(**inputs)\n", + " reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor\n", + " logits = outputs.get(\"logits\")\n", + " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))\n", + " loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))\n", + " return (loss, outputs) if return_outputs else loss" + ], + "metadata": { + "id": "dzVHNee8EP-T" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "tzNxUL7gLsYV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "trainer = MultitaskTrainer(\n", + " model=multitask_model,\n", + " args=transformers.TrainingArguments(\n", + " output_dir=\"./models/multitask_model\",\n", + " overwrite_output_dir=True,\n", + " learning_rate=1e-5,\n", + " do_train=True,\n", + " num_train_epochs=100,\n", + " # Adjust batch size if this doesn't fit on the Colab GPU\n", + " per_device_train_batch_size=8, \n", + " save_steps=3000,\n", + " ),\n", + " data_collator=data_collator,\n", + ")\n", + "trainer.train()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "TJ3CrZfgHGKq", + "outputId": "30aa520a-c8c4-4b53-8659-9f30b068562c" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "import dataclasses\n", - "from torch.utils.data.dataloader import DataLoader\n", - "from transformers.data.data_collator import DataCollatorForLanguageModeling, InputDataClass, DefaultDataCollator\n", - "from torch.utils.data.distributed import DistributedSampler\n", - "from torch.utils.data.sampler import RandomSampler\n", - "from typing import List, Union, Dict\n", - "from transformers import Trainer\n", - "from random import random\n", - "\n", - "\n", - "class NLPDataCollator(DataCollatorForLanguageModeling):\n", - " \"\"\"\n", - " Extending the existing DataCollator to work with NLP dataset batches\n", - " \"\"\"\n", - " def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:\n", - " first = features[0]\n", - " if isinstance(first, dict):\n", - " # NLP data sets current works presents features as lists of dictionary\n", - " # (one per example), so we will adapt the collate_batch logic for that\n", - " if \"labels\" in first and first[\"labels\"] is not None:\n", - " if first[\"labels\"].dtype == torch.int64:\n", - " labels = torch.tensor([f[\"labels\"] for f in features], 
dtype=torch.long)\n", - " else:\n", - " labels = torch.tensor([f[\"labels\"] for f in features], dtype=torch.float)\n", - " batch = {\"labels\": labels}\n", - " for k, v in first.items():\n", - " if k != \"labels\" and v is not None and not isinstance(v, str):\n", - " batch[k] = torch.stack([f[k] for f in features])\n", - " return batch\n", - " else:\n", - " # otherwise, revert to using the default collate_batch\n", - " return DefaultDataCollator().collate_batch(features)\n", - "\n", - "\n", - "class StrIgnoreDevice(str):\n", - " \"\"\"\n", - " This is a hack. The Trainer is going call .to(device) on every input\n", - " value, but we need to pass in an additional `task_name` string.\n", - " This prevents it from throwing an error\n", - " \"\"\"\n", - " def to(self, device):\n", - " return self\n", - "\n", - "class DataLoaderWithTaskname:\n", - " \"\"\"\n", - " Wrapper around a DataLoader to also yield a task name\n", - " \"\"\"\n", - " def __init__(self, task_name, data_loader):\n", - " self.task_name = task_name\n", - " self.data_loader = data_loader\n", - "\n", - " self.batch_size = data_loader.batch_size\n", - " self.dataset = data_loader.dataset\n", - "\n", - " def __len__(self):\n", - " return len(self.data_loader)\n", - " \n", - " def __iter__(self):\n", - " for batch in self.data_loader:\n", - " batch[\"task_name\"] = StrIgnoreDevice(self.task_name)\n", - " yield batch\n", - "\n", - "\n", - "class MultitaskDataloader:\n", - " \"\"\"\n", - " Data loader that combines and samples from multiple single-task\n", - " data loaders.\n", - " \"\"\"\n", - " def __init__(self, dataloader_dict):\n", - " self.dataloader_dict = dataloader_dict\n", - " self.num_batches_dict = {\n", - " task_name: len(dataloader) \n", - " for task_name, dataloader in self.dataloader_dict.items()\n", - " }\n", - " self.task_name_list = list(self.dataloader_dict)\n", - " self.dataset = [None] * sum(\n", - " len(dataloader.dataset) \n", - " for dataloader in self.dataloader_dict.values()\n", - " )\n", - "\n", - " def __len__(self):\n", - " return sum(self.num_batches_dict.values())\n", - "\n", - " def __iter__(self):\n", - " \"\"\"\n", - " For each batch, sample a task, and yield a batch from the respective\n", - " task Dataloader.\n", - "\n", - " We use size-proportional sampling, but you could easily modify this\n", - " to sample from some-other distribution.\n", - " \"\"\"\n", - " task_choice_list = []\n", - " for i, task_name in enumerate(self.task_name_list):\n", - " task_choice_list += [i] * self.num_batches_dict[task_name]\n", - " task_choice_list = np.array(task_choice_list)\n", - " np.random.shuffle(task_choice_list)\n", - " dataloader_iter_dict = {\n", - " task_name: iter(dataloader) \n", - " for task_name, dataloader in self.dataloader_dict.items()\n", - " }\n", - " for task_choice in task_choice_list:\n", - " task_name = self.task_name_list[task_choice]\n", - " yield next(dataloader_iter_dict[task_name]) \n", - "\n", - "class MultitaskTrainer(transformers.Trainer):\n", - "\n", - " def get_single_train_dataloader(self, task_name, train_dataset):\n", - " \"\"\"\n", - " Create a single-task data loader that also yields task names\n", - " \"\"\"\n", - " if self.train_dataset is None:\n", - " raise ValueError(\"Trainer: training requires a train_dataset.\")\n", - " \n", - " train_sampler = (\n", - " RandomSampler(train_dataset)\n", - " if self.args.local_rank == -1\n", - " else DistributedSampler(train_dataset)\n", - " )\n", - "\n", - " data_loader = DataLoaderWithTaskname(\n", - " task_name=task_name,\n", - " 
data_loader=DataLoader(\n", - " train_dataset,\n", - " batch_size=self.args.train_batch_size,\n", - " sampler=train_sampler\n", - " ),\n", - " )\n", - "\n", - " return data_loader\n", - "\n", - " def get_train_dataloader(self):\n", - " \"\"\"\n", - " Returns a MultitaskDataloader, which is not actually a Dataloader\n", - " but an iterable that returns a generator that samples from each \n", - " task Dataloader\n", - " \"\"\"\n", - " return MultitaskDataloader({\n", - " task_name: self.get_single_train_dataloader(task_name, task_dataset)\n", - " for task_name, task_dataset in self.train_dataset.items()\n", - " })\n", - " \n", - " def train(self):\n", - " config = transformers.AutoConfig.from_pretrained(\"gpt2\")\n", - " model = transformers.AutoModelWithLMHead.from_pretrained(\"gpt2\", config=config)\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=transformers.TrainingArguments(\n", - " output_dir=\"./models/multitask_model\",\n", - " overwrite_output_dir=True,\n", - " learning_rate=1e-5,\n", - " do_train=True,\n", - " num_train_epochs=100,\n", - " # Adjust batch size if this doesn't fit on the Colab GPU\n", - " per_device_train_batch_size=8, \n", - " save_steps=3000,\n", - " ),\n", - " data_collator=data_collator,\n", - " train_dataset=train_dataset,\n", - " )\n", - " trainer.train()\n", - "\n", - " def compute_loss(self, model, inputs, return_outputs=True):\n", - " labels = inputs.get(\"labels\")\n", - " # forward pass\n", - " outputs = model(**inputs)\n", - " reranking_layer(outputs, inputs._get_value(), tokenizer=tokenizer) #input value is tensor\n", - " logits = outputs.get(\"logits\")\n", - " loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))\n", - " loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))\n", - " return (loss, outputs) if return_outputs else loss" - ], - "metadata": { - "id": "dzVHNee8EP-T" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "" - ], - "metadata": { - "id": "tzNxUL7gLsYV" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. 
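
> The `compute_loss` override above re-weights the cross-entropy with a fixed class-weight tensor before the Trainer backpropagates it. A self-contained sketch of that pattern on its own (the three weights and the sequence-classification setup are illustrative assumptions, not the notebook's exact configuration):

```python
# Sketch of a Trainer subclass with a class-weighted cross-entropy loss.
import torch
import torch.nn as nn
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Up-weight rarer classes so they contribute more to the gradient.
        weight = torch.tensor([1.0, 2.0, 3.0], device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```
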
You should start updating your code and make this info disappear :-).\n", + "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", + "Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.19.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " FutureWarning,\n", + "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", + "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", + "\n", + "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", + "PyTorch: setting up devices\n", + "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", + "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " FutureWarning,\n", + "***** Running training *****\n", + " Num examples = 288\n", + " Num Epochs = 100\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. 
parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 3600\n" + ] }, { - "cell_type": "code", - "source": [ - "trainer = MultitaskTrainer(\n", - " model=multitask_model,\n", - " args=transformers.TrainingArguments(\n", - " output_dir=\"./models/multitask_model\",\n", - " overwrite_output_dir=True,\n", - " learning_rate=1e-5,\n", - " do_train=True,\n", - " num_train_epochs=100,\n", - " # Adjust batch size if this doesn't fit on the Colab GPU\n", - " per_device_train_batch_size=8, \n", - " save_steps=3000,\n", - " ),\n", - " data_collator=data_collator,\n", - ")\n", - "trainer.train()\n", - "\n" + "output_type": "display_data", + "data": { + "text/plain": [ + "" ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "TJ3CrZfgHGKq", - "outputId": "30aa520a-c8c4-4b53-8659-9f30b068562c" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "PyTorch: setting up devices\n", - "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", - "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n", - "Model config GPT2Config {\n", - " \"_name_or_path\": \"gpt2\",\n", - " \"activation_function\": \"gelu_new\",\n", - " \"architectures\": [\n", - " \"GPT2LMHeadModel\"\n", - " ],\n", - " \"attn_pdrop\": 0.1,\n", - " \"bos_token_id\": 50256,\n", - " \"embd_pdrop\": 0.1,\n", - " \"eos_token_id\": 50256,\n", - " \"initializer_range\": 0.02,\n", - " \"layer_norm_epsilon\": 1e-05,\n", - " \"model_type\": \"gpt2\",\n", - " \"n_ctx\": 1024,\n", - " \"n_embd\": 768,\n", - " \"n_head\": 12,\n", - " \"n_inner\": null,\n", - " \"n_layer\": 12,\n", - " \"n_positions\": 1024,\n", - " \"reorder_and_upcast_attn\": false,\n", - " \"resid_pdrop\": 0.1,\n", - " \"scale_attn_by_inverse_layer_idx\": false,\n", - " \"scale_attn_weights\": true,\n", - " \"summary_activation\": null,\n", - " \"summary_first_dropout\": 0.1,\n", - " \"summary_proj_to_labels\": true,\n", - " \"summary_type\": \"cls_index\",\n", - " \"summary_use_proj\": true,\n", - " \"task_specific_params\": {\n", - " \"text-generation\": {\n", - " \"do_sample\": true,\n", - " \"max_length\": 50\n", - " }\n", - " },\n", - " \"transformers_version\": \"4.19.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 50257\n", - "}\n", - "\n", - "/usr/local/lib/python3.7/dist-packages/transformers/models/auto/modeling_auto.py:915: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. 
Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", - " FutureWarning,\n", - "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n", - "All model checkpoint weights were used when initializing GPT2LMHeadModel.\n", - "\n", - "All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.\n", - "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.\n", - "PyTorch: setting up devices\n", - "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n", - "/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", - " FutureWarning,\n", - "***** Running training *****\n", - " Num examples = 288\n", - " Num Epochs = 100\n", - " Instantaneous batch size per device = 8\n", - " Total train batch size (w. parallel, distributed & accumulation) = 8\n", - " Gradient Accumulation steps = 1\n", - " Total optimization steps = 3600\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - "
\n", - " \n", - " \n", - " [ 383/3600 1:49:51 < 15:27:36, 0.06 it/s, Epoch 10.61/100]\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining Loss

" - ] - }, - "metadata": {} - } + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [ 383/3600 1:49:51 < 15:27:36, 0.06 it/s, Epoch 10.61/100]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss

" ] - }, - { - "cell_type": "code", - "source": [ - " preds_dict = {}\n", - " for task_name in [\"token\", \"token_type\", \"line\"]:\n", - " eval_dataloader = DataLoaderWithTaskname(\n", - " task_name,\n", - " trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])\n", - " )\n", - " print(eval_dataloader.data_loader.collate_fn)\n", - " preds_dict[task_name] = trainer.prediction_loop(\n", - " eval_dataloader, \n", - " description=f\"Validation: {task_name}\",\n", - " )\n", - "\n", - "\n", - " print(preds_dict)" - ], - "metadata": { - "id": "Xgw82zyxp-_5" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from sklearn.metrics import accuracy_score, label_ranking_average_precision_score\n", - "\n", - "accuracy_dict = {}\n", - "mrr_dict = {}\n", - "\n", - "for task_name in [\"token\", \"token_type\", \"line\"]:\n", - " accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),\n", - " preds_dict[task_name].label_ids)\n", - " \n", - " mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),\n", - " preds_dict[task_name].label_ids)\n", - " " - ], - "metadata": { - "id": "XWUxUWUVE5Dq" - }, - "execution_count": null, - "outputs": [] + }, + "metadata": {} } - ] + ] + }, + { + "cell_type": "code", + "source": [ + " preds_dict = {}\n", + " for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " eval_dataloader = DataLoaderWithTaskname(\n", + " task_name,\n", + " trainer.get_eval_dataloader(eval_dataset=dataset_dict[task_name])\n", + " )\n", + " print(eval_dataloader.data_loader.collate_fn)\n", + " preds_dict[task_name] = trainer.prediction_loop(\n", + " eval_dataloader, \n", + " description=f\"Validation: {task_name}\",\n", + " )\n", + "\n", + "\n", + " print(preds_dict)" + ], + "metadata": { + "id": "Xgw82zyxp-_5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score, label_ranking_average_precision_score\n", + "\n", + "accuracy_dict = {}\n", + "mrr_dict = {}\n", + "\n", + "for task_name in [\"token\", \"token_type\", \"line\"]:\n", + " accuracy_dict[task_name] = accuracy_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " \n", + " mrr_dict[task_name] = label_ranking_average_precision_score(preds_dict[task_name].predictions.flatten(),\n", + " preds_dict[task_name].label_ids)\n", + " " + ], + "metadata": { + "id": "XWUxUWUVE5Dq" + }, + "execution_count": null, + "outputs": [] + } + ] } \ No newline at end of file