diff --git a/Hate_Speech_Ethiopia.ipynb b/Hate_Speech_Ethiopia.ipynb new file mode 100644 index 0000000..99bdeb6 --- /dev/null +++ b/Hate_Speech_Ethiopia.ipynb @@ -0,0 +1,1410 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Hate Speech - Ethiopia.ipynb", + "provenance": [], + "authorship_tag": "ABX9TyMrbg9XCwH/rahVzjhlwI0Y", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0GZ0Y0S_Nv3m" + }, + "source": [ + "import pandas as pd" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "d-AgFL-yR7eR" + }, + "source": [ + "terms = pd.read_csv('/content/Terms.csv')" + ], + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 224 + }, + "id": "pucm09BHTbd3", + "outputId": "3a2a2aac-de5c-4285-cee3-d8b1cc1c64ef" + }, + "source": [ + "terms.head()" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
List_the_hate_speech_phrase_with_a_commaa_Terma_Term_001a_Term_002
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX
1rienrienNaNNaN
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;
\n", + "
" + ], + "text/plain": [ + " List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", + "1 rien ... NaN\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", + "\n", + "[5 rows x 4 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rbIDpJd1WMaN", + "outputId": "d4a3cda3-2033-4b31-8b86-28408f0e13c2" + }, + "source": [ + "print(terms)" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "text": [ + " List_the_hate_speech_phrase_with_a_comma ... a_Term_002\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", + "1 rien ... NaN\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", + ".. ... ... ...\n", + "477 chamarocka, kholo, nda kangue ... nda kangue\n", + "478 Amdjoudoul iwacki koudjoumass ... koundjou mass\n", + "479 Bengue, arabo, pro français, pro russes, moutons, ... Cannibales\n", + "480 Gangster ... NaN\n", + "481 - Lawa Lawa\\n- Benguè\\n- Bandaï ... Bandaï\n", + "\n", + "[482 rows x 4 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "uUU-nueeWSqW", + "outputId": "31ea9473-7207-478b-cb7d-fac47b8ccf92" + }, + "source": [ + "terms.describe()" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
List_the_hate_speech_phrase_with_a_commaa_Terma_Term_001a_Term_002
count482482284185
unique481403246168
topETRANGER ; RELIGION ; POLITIQUE ;ETRANGERBENGUEBENGUE
freq21155
\n", + "
" + ], + "text/plain": [ + " List_the_hate_speech_phrase_with_a_comma a_Term a_Term_001 a_Term_002\n", + "count 482 482 284 185\n", + "unique 481 403 246 168\n", + "top ETRANGER ; RELIGION ; POLITIQUE ; ETRANGER BENGUE BENGUE\n", + "freq 2 11 5 5" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "q4cU4Q_hXdsQ" + }, + "source": [ + "terms.rename(columns={'List_the_hate_speech_phrase_with_a_comma':'terms_list', 'a_Term':'term_1', 'a_Term_001':'term_2','a_Term_002':'term_3'}, inplace=True)" + ], + "execution_count": 13, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 781 + }, + "id": "bS2p70oZX7cg", + "outputId": "bbee9b17-d4c8-4562-8528-3e06f1821554" + }, + "source": [ + "terms.head(20)" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX
1rienrienNaNNaN
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;
5MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ...MO KPA GUI A ALBUNOSNaNNaN
6RELIGION ; ETHNIQUE , FOOT-BALLRELIGIONETHNIQUENaN
7VOYOUTISME ; IMPURE ; MECREANT ;VOYOUTISMEIMPURENaN
8KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S...kPANDALE TI BONGO T I MO OKO SO KOUEMO SO MO YEKE NA GNE
9FAUSSEUR ; HOMO SEXUEL ; BARBARIE ;FAUSSEURHOMO SEXUELBARBARIE
10CONSIDERER LES MUSULMANS COMME LES TERRORISTESIDIOTBANDAYENaN
11BRAQUEUR ; DESORDONNE ; FOU ;BRAQUEURDESORDONNENaN
12EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI...EGALITE ENTRE LES SEXESDROIT DE L'ENFANTNaN
13CRFISE DE COVID 19 ; CRISE ECONOMIQUE ;GROUPE DE BANDITSILLETRENaN
14Balaka, Seleka, a baba soBalakaSelekaTi ala a baba so
15Gagango, arabou, soukoula biGagangoArabouSoukoula mbi
16SARANGA ; BORDELSARANGANaNNaN
17A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LAA GA GANGOALA GA LAWANaN
18ESCROC ; IDIOT ; I MOU MOESCROCNaNNaN
19L'insulte,la division, le racisme, l'ethnocent...IdiotBon à rien!Âne
\n", + "
" + ], + "text/plain": [ + " terms_list ... term_3\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... VOYOUX\n", + "1 rien ... NaN\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... NaN\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... NaN\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES MANDJA SONT DES GRANDS VOLEURS ;\n", + "5 MO KPA GUI A ALBUNOS ; MO GUI MBI FURU NA MO ... ... NaN\n", + "6 RELIGION ; ETHNIQUE , FOOT-BALL ... NaN\n", + "7 VOYOUTISME ; IMPURE ; MECREANT ; ... NaN\n", + "8 KPANDA ; LE TI BONGO TI MO OKO SO KOUE ; MO S... ... MO SO MO YEKE NA GNE\n", + "9 FAUSSEUR ; HOMO SEXUEL ; BARBARIE ; ... BARBARIE\n", + "10 CONSIDERER LES MUSULMANS COMME LES TERRORISTES ... NaN\n", + "11 BRAQUEUR ; DESORDONNE ; FOU ; ... NaN\n", + "12 EGALITE ENTRE LES SEXSES ; GUERRE ; NATIONNALI... ... NaN\n", + "13 CRFISE DE COVID 19 ; CRISE ECONOMIQUE ; ... NaN\n", + "14 Balaka, Seleka, a baba so ... Ti ala a baba so\n", + "15 Gagango, arabou, soukoula bi ... Soukoula mbi\n", + "16 SARANGA ; BORDEL ... NaN\n", + "17 A GA GANGO ; ALA GA LAWA ; MO NI SO KOUE LA ... NaN\n", + "18 ESCROC ; IDIOT ; I MOU MO ... NaN\n", + "19 L'insulte,la division, le racisme, l'ethnocent... ... Âne\n", + "\n", + "[20 rows x 4 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xrvxLq8naX8x", + "outputId": "c3136ea0-625d-4831-9e09-b3c83b00a35d" + }, + "source": [ + "terms['term_1'].value_counts()" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ETRANGER 11\n", + "ETHNIQUE 6\n", + "A MBO TI TOUADERA 6\n", + "MBORORO 5\n", + "RELIGION 4\n", + " ..\n", + "FOUNGO TERE 1\n", + "GA GA NGON 1\n", + "ALA A BABA SO LA ? ; 1\n", + "JE SUIS FACA ; 1\n", + "GROUPE DE BANDITS 1\n", + "Name: term_1, Length: 403, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "buyvQpOJh322" + }, + "source": [ + "from sklearn import preprocessing" + ], + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jSdlaj2L1DY1" + }, + "source": [ + "section2 = terms['terms_list']" + ], + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aX3cBB_51S3_", + "outputId": "f30015d2-c884-4330-9734-08f1393d92ef" + }, + "source": [ + "section2" + ], + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n", + "1 rien\n", + "2 HAINE ; RELIGION ; ETHNIQUE ;\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ?\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n", + " ... \n", + "477 chamarocka, kholo, nda kangue\n", + "478 Amdjoudoul iwacki koudjoumass\n", + "479 Bengue, arabo, pro français, pro russes, moutons,\n", + "480 Gangster\n", + "481 - Lawa Lawa\\n- Benguè\\n- Bandaï\n", + "Name: terms_list, Length: 482, dtype: object" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ot60Lmew6qp0", + "outputId": "3735a9c3-5c4e-4489-caf0-f8ab54c950b2" + }, + "source": [ + "!pip install contractions\n", + "import contractions" + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting contractions\n", + " Downloading https://files.pythonhosted.org/packages/ce/ad/d1c685967945a04f8596128b15a1ab56c51488f53312e953341af6ff22d1/contractions-0.0.43-py2.py3-none-any.whl\n", + "Collecting textsearch\n", + " Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl\n", + "Collecting pyahocorasick\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)\n", + "\u001b[K |████████████████████████████████| 317kB 5.9MB/s \n", + "\u001b[?25hCollecting Unidecode\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)\n", + "\u001b[K |████████████████████████████████| 245kB 41.9MB/s \n", + "\u001b[?25hBuilding wheels for collected packages: pyahocorasick\n", + " Building wheel for pyahocorasick (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pyahocorasick: filename=pyahocorasick-1.4.0-cp36-cp36m-linux_x86_64.whl size=81707 sha256=0cb88880bcc215b7a3749858fd619d028c638f276938a7ffe08d22897d33c4d8\n", + " Stored in directory: /root/.cache/pip/wheels/0a/90/61/87a55f5b459792fbb2b7ba6b31721b06ff5cf6bde541b40994\n", + "Successfully built pyahocorasick\n", + "Installing collected packages: pyahocorasick, Unidecode, textsearch, contractions\n", + "Successfully installed Unidecode-1.1.1 contractions-0.0.43 pyahocorasick-1.4.0 textsearch-0.0.17\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yFq0P9PHFMFr" + }, + "source": [ + "terms['no_contract'] = section2.apply(lambda x: [contractions.fix(word) for word in x.split()])" + ], + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 276 + }, + "id": "v6Q_V3rWFoNA", + "outputId": "0d3d3388-87ad-4d37-fd94-30a9137b94e4" + }, + "source": [ + "terms.head()" + ], + "execution_count": 28, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3no_contract
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]
1rienrienNaNNaN[rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...
\n", + "
" + ], + "text/plain": [ + " terms_list ... no_contract\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]\n", + "1 rien ... [rien]\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...\n", + "\n", + "[5 rows x 5 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 28 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 293 + }, + "id": "yCUL-zsLQ4-O", + "outputId": "0dbc0192-3ead-40d9-f81d-00d575672d0a" + }, + "source": [ + "terms[\"msg_str\"] = [' '.join(map(str, l)) for l in terms['no_contract']]\n", + "terms.head()" + ], + "execution_count": 36, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3no_contractmsg_str
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,
1rienrienNaNNaN[rien]rien
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...
\n", + "
" + ], + "text/plain": [ + " terms_list ... msg_str\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,\n", + "1 rien ... rien\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... HAINE ; RELIGION ; ETHNIQUE ;\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... TETUE ; VOYOU ; MO YINGA MBI ?\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...\n", + "\n", + "[5 rows x 6 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cEwARPq_GG08", + "outputId": "f568dc5f-a4f2-44c4-9964-89c0e712fb8e" + }, + "source": [ + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize" + ], + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "18uESl1iP1RL" + }, + "source": [ + "text = \"Hi, I would like to tokenize this sentence\"" + ], + "execution_count": 31, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MF66YtCuP5YZ", + "outputId": "72c38ef0-513a-435a-8950-80de388ea66c" + }, + "source": [ + "print(word_tokenize(text))" + ], + "execution_count": 32, + "outputs": [ + { + "output_type": "stream", + "text": [ + "['Hi', ',', 'I', 'would', 'like', 'to', 'tokenize', 'this', 'sentence']\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tAsC9yEvQNNt" + }, + "source": [ + "terms['tokenized'] = terms['msg_str'].apply(word_tokenize)" + ], + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 328 + }, + "id": "YbkjpCyiRRNt", + "outputId": "782297b9-595b-4aba-bb89-ceea73fdc3ff" + }, + "source": [ + "terms.head()" + ], + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3no_contractmsg_strtokenized
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...
1rienrienNaNNaN[rien]rien[rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...
\n", + "
" + ], + "text/plain": [ + " terms_list ... tokenized\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...\n", + "1 rien ... [rien]\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... [HAINE, ;, RELIGION, ;, ETHNIQUE, ;]\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...\n", + "\n", + "[5 rows x 7 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 + }, + "id": "_w9FQIj9SrTG", + "outputId": "dfef973c-bfcb-4c76-d891-c91e4c0c4bd7" + }, + "source": [ + "terms['lower'] = terms['tokenized'].apply(lambda x: [word.lower() for word in x])\n", + "terms.head()" + ], + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3no_contractmsg_strtokenizedlower
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...
1rienrienNaNNaN[rien]rien[rien][rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;][haine, ;, religion, ;, ethnique, ;]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?][tetue, ;, voyou, ;, mo, yinga, mbi, ?]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...[les, gbakas, mandja, sont, trop, egoistes, ;,...
\n", + "
" + ], + "text/plain": [ + " terms_list ... lower\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...\n", + "1 rien ... [rien]\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, ;, religion, ;, ethnique, ;]\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, ;, voyou, ;, mo, yinga, mbi, ?]\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, ;,...\n", + "\n", + "[5 rows x 8 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 380 + }, + "id": "GXCYOG-XTYJy", + "outputId": "5936953c-3c9e-49a9-dcda-3418e1ce939c" + }, + "source": [ + "import string\n", + "punc = string.punctuation\n", + "terms['no_punc'] = terms['lower'].apply(lambda x: [word for word in x if word not in punc])\n", + "terms.head()" + ], + "execution_count": 42, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
terms_listterm_1term_2term_3no_contractmsg_strtokenizedlowerno_punc
0SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,SARANGA TI WALITAXI-MOTOVOYOUX[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ;,]SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;,[SARANGA, TI, WALI, ;, TAXI, -MOTO, ;, VOYOU, ...[saranga, ti, wali, ;, taxi, -moto, ;, voyou, ...[saranga, ti, wali, taxi, -moto, voyou]
1rienrienNaNNaN[rien]rien[rien][rien][rien]
2HAINE ; RELIGION ; ETHNIQUE ;HAINENaNNaN[HAINE, ;, RELIGION, ;, ETHNIQUE, ;]HAINE ; RELIGION ; ETHNIQUE ;[HAINE, ;, RELIGION, ;, ETHNIQUE, ;][haine, ;, religion, ;, ethnique, ;][haine, religion, ethnique]
3TETUE ; VOYOU ; MO YINGA MBI ?TETUENaNNaN[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?]TETUE ; VOYOU ; MO YINGA MBI ?[TETUE, ;, VOYOU, ;, MO, YINGA, MBI, ?][tetue, ;, voyou, ;, mo, yinga, mbi, ?][tetue, voyou, mo, yinga, mbi]
4LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...LES GBAKAS MANDJA SONT TROP EGOISTESLES YAKOMAS SONT DES ORGUEILLEUXLES MANDJA SONT DES GRANDS VOLEURS ;[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES;, L...LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO...[LES, GBAKAS, MANDJA, SONT, TROP, EGOISTES, ;,...[les, gbakas, mandja, sont, trop, egoistes, ;,...[les, gbakas, mandja, sont, trop, egoistes, le...
\n", + "
" + ], + "text/plain": [ + " terms_list ... no_punc\n", + "0 SARANGA TI WALI ; TAXI -MOTO ; VOYOU ;, ... [saranga, ti, wali, taxi, -moto, voyou]\n", + "1 rien ... [rien]\n", + "2 HAINE ; RELIGION ; ETHNIQUE ; ... [haine, religion, ethnique]\n", + "3 TETUE ; VOYOU ; MO YINGA MBI ? ... [tetue, voyou, mo, yinga, mbi]\n", + "4 LES GBAKAS MANDJA SONT TROP EGOISTES; LES YAKO... ... [les, gbakas, mandja, sont, trop, egoistes, le...\n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UYk5KcIZUJAZ", + "outputId": "85b4298c-44e8-4714-a51b-fae2b2afecaf" + }, + "source": [ + "terms.terms_list.str.split(expand=True).stack().value_counts()[:50]" + ], + "execution_count": 55, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "; 943\n", + "TI 274\n", + "MO 188\n", + "A 174\n", + "ZO 102\n", + "SO 88\n", + "ALA 78\n", + "LA 77\n", + "LO 69\n", + "BA 49\n", + "NA 46\n", + "GA 44\n", + "? 44\n", + "MBI 38\n", + "YEKE 35\n", + ", 34\n", + "BOUBA 33\n", + "LES 31\n", + "WALI 29\n", + "APE 28\n", + "TU 28\n", + "I 26\n", + "BENGUE 26\n", + "MAMA 24\n", + "AWE 22\n", + "LAWA 20\n", + "DE 19\n", + "ARABO 19\n", + "PINDOUNGOU 18\n", + "DES 18\n", + "RELIGION 17\n", + "TOUADERA 17\n", + "MBORORO 17\n", + "ETHNIQUE 16\n", + "GANGO 16\n", + "ETRANGER 16\n", + "YA 15\n", + "LE 15\n", + "KATA 15\n", + "MBO 14\n", + "GBAYA 14\n", + "ME 13\n", + "ES 13\n", + "BANDA 13\n", + "TA 13\n", + "POLITIQUE 13\n", + "INGA 13\n", + "KE 13\n", + "SELEKA 13\n", + "ANDE 12\n", + "dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 55 + } + ] + } + ] +} \ No newline at end of file