{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting nltk\n",
      "  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)\n",
      "     ---------------------------------------- 1.5/1.5 MB 10.5 MB/s eta 0:00:00\n",
      "Collecting regex>=2021.8.3\n",
      "  Downloading regex-2022.9.13-cp38-cp38-win_amd64.whl (267 kB)\n",
      "     ------------------------------------- 267.7/267.7 kB 16.1 MB/s eta 0:00:00\n",
      "Requirement already satisfied: tqdm in c:\\users\\win10\\anaconda3\\envs\\development\\lib\\site-packages (from nltk) (4.62.3)\n",
      "Requirement already satisfied: click in c:\\users\\win10\\anaconda3\\envs\\development\\lib\\site-packages (from nltk) (8.0.3)\n",
      "Requirement already satisfied: joblib in c:\\users\\win10\\anaconda3\\envs\\development\\lib\\site-packages (from nltk) (1.1.0)\n",
      "Requirement already satisfied: colorama in c:\\users\\win10\\anaconda3\\envs\\development\\lib\\site-packages (from click->nltk) (0.4.4)\n",
      "Installing collected packages: regex, nltk\n",
      "Successfully installed nltk-3.7 regex-2022.9.13\n",
      "\n",
      "[notice] A new release of pip available: 22.1.2 -> 22.3\n",
      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
     ]
    }
   ],
   "source": [
    "!pip install nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "corpus=\"\"\"Hello Welcome,to Krish Naik's NLP Tutorials.\n",
    "Please do watch the entire course! to become expert in NLP.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello Welcome,to Krish Naik's NLP Tutorials.\n",
      "Please do watch the entire course! to become expert in NLP.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "##  Tokenization\n",
    "## Sentence-->paragraphs\n",
    "from nltk.tokenize import sent_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents=sent_tokenize(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello Welcome,to Krish Naik's NLP Tutorials.\n",
      "Please do watch the entire course!\n",
      "to become expert in NLP.\n"
     ]
    }
   ],
   "source": [
    "for sentence in documents:\n",
    "    print(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Tokenization \n",
    "## Paragraph-->words\n",
    "## sentence--->words\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hello',\n",
       " 'Welcome',\n",
       " ',',\n",
       " 'to',\n",
       " 'Krish',\n",
       " 'Naik',\n",
       " \"'s\",\n",
       " 'NLP',\n",
       " 'Tutorials',\n",
       " '.',\n",
       " 'Please',\n",
       " 'do',\n",
       " 'watch',\n",
       " 'the',\n",
       " 'entire',\n",
       " 'course',\n",
       " '!',\n",
       " 'to',\n",
       " 'become',\n",
       " 'expert',\n",
       " 'in',\n",
       " 'NLP',\n",
       " '.']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_tokenize(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello', 'Welcome', ',', 'to', 'Krish', 'Naik', \"'s\", 'NLP', 'Tutorials', '.']\n",
      "['Please', 'do', 'watch', 'the', 'entire', 'course', '!']\n",
      "['to', 'become', 'expert', 'in', 'NLP', '.']\n"
     ]
    }
   ],
   "source": [
    "for sentence in documents:\n",
    "    print(word_tokenize(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import wordpunct_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hello',\n",
       " 'Welcome',\n",
       " ',',\n",
       " 'to',\n",
       " 'Krish',\n",
       " 'Naik',\n",
       " \"'\",\n",
       " 's',\n",
       " 'NLP',\n",
       " 'Tutorials',\n",
       " '.',\n",
       " 'Please',\n",
       " 'do',\n",
       " 'watch',\n",
       " 'the',\n",
       " 'entire',\n",
       " 'course',\n",
       " '!',\n",
       " 'to',\n",
       " 'become',\n",
       " 'expert',\n",
       " 'in',\n",
       " 'NLP',\n",
       " '.']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wordpunct_tokenize(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import TreebankWordTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer=TreebankWordTokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Hello',\n",
       " 'Welcome',\n",
       " ',',\n",
       " 'to',\n",
       " 'Krish',\n",
       " 'Naik',\n",
       " \"'s\",\n",
       " 'NLP',\n",
       " 'Tutorials.',\n",
       " 'Please',\n",
       " 'do',\n",
       " 'watch',\n",
       " 'the',\n",
       " 'entire',\n",
       " 'course',\n",
       " '!',\n",
       " 'to',\n",
       " 'become',\n",
       " 'expert',\n",
       " 'in',\n",
       " 'NLP',\n",
       " '.']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.tokenize(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
