{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Importamos módulos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Configuración: ruta de los datos, tamaño del conjunto de prueba y semilla"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder: one sub-folder per class plus the Metadata folder.\n",
    "DATA_DIR = \"../datos\"\n",
    "# Fraction of the rows held out for the test split.\n",
    "TEST_SIZE = 1 / 3\n",
    "# Fixed seed so that Restart & Run All always yields the same split.\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lectura de datos (metadatos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       " .dataframe tbody tr th:only-of-type {\n",
       " vertical-align: middle;\n",
       " }\n",
       "\n",
       " .dataframe tbody tr th {\n",
       " vertical-align: top;\n",
       " }\n",
       "\n",
       " .dataframe thead th {\n",
       " text-align: right;\n",
       " }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>FILE NAME</th>\n",
       " <th>CLASE</th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th>0</th>\n",
       " <td>COVID-19(1)</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>1</th>\n",
       " <td>COVID-19(2)</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2</th>\n",
       " <td>COVID-19(3)</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>3</th>\n",
       " <td>COVID-19(4)</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4</th>\n",
       " <td>COVID-19(5)</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       " FILE NAME CLASE\n",
       "0 COVID-19(1) COVID-19\n",
       "1 COVID-19(2) COVID-19\n",
       "2 COVID-19(3) COVID-19\n",
       "3 COVID-19(4) COVID-19\n",
       "4 COVID-19(5) COVID-19"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata = pd.read_csv(os.path.join(DATA_DIR, \"Metadata\", \"metadatos.csv\"))\n",
    "metadata.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Separamos en entrenamiento y prueba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preliminary split on the metadata table. `train` and `test` are rebuilt\n",
    "# further down from the file names actually found on disk.\n",
    "train, test = train_test_split(\n",
    "    metadata, test_size=TEST_SIZE, stratify=metadata.CLASE, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Observamos las mismas proporciones en todos los subconjuntos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creamos en el directorio datos los directorios train y test donde almacenaremos las imágenes correspondientes (se crean automáticamente al copiar las imágenes, al final del cuaderno)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Intentamos obtener los nombres reales de las imágenes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def list_class_files(class_name):\n",
    "    \"\"\"Return the sorted names of the regular files in DATA_DIR/<class_name>.\n",
    "\n",
    "    Sorting removes any dependence on the order in which os.listdir returns\n",
    "    the entries, so the seeded split below sees the same rows on every run.\n",
    "    Entries that are not regular files are skipped; shutil.copyfile could not\n",
    "    copy them later on.\n",
    "    \"\"\"\n",
    "    class_dir = os.path.join(DATA_DIR, class_name)\n",
    "    return sorted(\n",
    "        name\n",
    "        for name in os.listdir(class_dir)\n",
    "        if os.path.isfile(os.path.join(class_dir, name))\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "covid_files = list_class_files(\"COVID-19\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "normal_files = list_class_files(\"NORMAL\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "pneumonia_files = list_class_files(\"Viral Pneumonia\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = normal_files + pneumonia_files + covid_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One label per file, in the same order used to build `files`.\n",
    "clase = (\n",
    "    [\"NORMAL\"] * len(normal_files)\n",
    "    + [\"Viral Pneumonia\"] * len(pneumonia_files)\n",
    "    + [\"COVID-19\"] * len(covid_files)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       " .dataframe tbody tr th:only-of-type {\n",
       " vertical-align: middle;\n",
       " }\n",
       "\n",
       " .dataframe tbody tr th {\n",
       " vertical-align: top;\n",
       " }\n",
       "\n",
       " .dataframe thead th {\n",
       " text-align: right;\n",
       " }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       " <thead>\n",
       " <tr style=\"text-align: right;\">\n",
       " <th></th>\n",
       " <th>FILENAME</th>\n",
       " <th>CLASS</th>\n",
       " </tr>\n",
       " </thead>\n",
       " <tbody>\n",
       " <tr>\n",
       " <th>0</th>\n",
       " <td>NORMAL (1).png</td>\n",
       " <td>NORMAL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>1</th>\n",
       " <td>NORMAL (10).png</td>\n",
       " <td>NORMAL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2</th>\n",
       " <td>NORMAL (100).png</td>\n",
       " <td>NORMAL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>3</th>\n",
       " <td>NORMAL (101).png</td>\n",
       " <td>NORMAL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>4</th>\n",
       " <td>NORMAL (102).png</td>\n",
       " <td>NORMAL</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>...</th>\n",
       " <td>...</td>\n",
       " <td>...</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2900</th>\n",
       " <td>COVID-19(215).png</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2901</th>\n",
       " <td>COVID-19(216).png</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2902</th>\n",
       " <td>COVID-19(217).png</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2903</th>\n",
       " <td>COVID-19(218).png</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " <tr>\n",
       " <th>2904</th>\n",
       " <td>COVID-19(219).png</td>\n",
       " <td>COVID-19</td>\n",
       " </tr>\n",
       " </tbody>\n",
       "</table>\n",
       "<p>2905 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       " FILENAME CLASS\n",
       "0 NORMAL (1).png NORMAL\n",
       "1 NORMAL (10).png NORMAL\n",
       "2 NORMAL (100).png NORMAL\n",
       "3 NORMAL (101).png NORMAL\n",
       "4 NORMAL (102).png NORMAL\n",
       "... ... ...\n",
       "2900 COVID-19(215).png COVID-19\n",
       "2901 COVID-19(216).png COVID-19\n",
       "2902 COVID-19(217).png COVID-19\n",
       "2903 COVID-19(218).png COVID-19\n",
       "2904 COVID-19(219).png COVID-19\n",
       "\n",
       "[2905 rows x 2 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame({\"FILENAME\": files, \"CLASS\": clase})\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Separación en entrenamiento y prueba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratified split on the files found on disk; seeded so that re-running the\n",
    "# notebook assigns every image to the same subset.\n",
    "train, test = train_test_split(\n",
    "    df, test_size=TEST_SIZE, stratify=df.CLASS, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "METADATA:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia 1345\n",
      "NORMAL 1341\n",
      "COVID-19 219\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia 0.462995\n",
      "NORMAL 0.461618\n",
      "COVID-19 0.075387\n",
      "Name: CLASS, dtype: float64\n",
      "----------------------------------------------\n",
      "TRAIN:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia 896\n",
      "NORMAL 894\n",
      "COVID-19 146\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia 0.462810\n",
      "NORMAL 0.461777\n",
      "COVID-19 0.075413\n",
      "Name: CLASS, dtype: float64\n",
      "----------------------------------------------\n",
      "TEST:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia 449\n",
      "NORMAL 447\n",
      "COVID-19 73\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia 0.463364\n",
      "NORMAL 0.461300\n",
      "COVID-19 0.075335\n",
      "Name: CLASS, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "print(\"METADATA:\")\n",
    "print(\"Total frecuencies: \\n\",df.CLASS.value_counts())\n",
    "print(\"Proportion: \\n\",df.CLASS.value_counts()/df.shape[0])\n",
    "print(\"----------------------------------------------\")\n",
    "print(\"TRAIN:\")\n",
    "print(\"Total frecuencies: \\n\",train.CLASS.value_counts())\n",
    "print(\"Proportion: \\n\",train.CLASS.value_counts()/train.shape[0])\n",
    "print(\"----------------------------------------------\")\n",
    "print(\"TEST:\")\n",
    "print(\"Total frecuencies: \\n\",test.CLASS.value_counts())\n",
    "print(\"Proportion: \\n\",test.CLASS.value_counts()/test.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Comprobamos que las proporciones se mantienen: hemos realizado un muestreo estratificado."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_split(split_df, split_name):\n",
    "    \"\"\"Copy every image listed in `split_df` to DATA_DIR/<split_name>/<CLASS>/.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    split_df : DataFrame with the FILENAME and CLASS columns of `df`.\n",
    "    split_name : destination folder under DATA_DIR (\"train\" or \"test\").\n",
    "\n",
    "    Missing destination folders are created first, because shutil.copyfile\n",
    "    does not create parent directories. Files already present in the\n",
    "    destination from earlier runs are overwritten when they share a name and\n",
    "    are otherwise left untouched.\n",
    "    \"\"\"\n",
    "    for class_name in split_df[\"CLASS\"].unique():\n",
    "        os.makedirs(os.path.join(DATA_DIR, split_name, class_name), exist_ok=True)\n",
    "\n",
    "    for filename, class_name in zip(split_df[\"FILENAME\"], split_df[\"CLASS\"]):\n",
    "        src = os.path.join(DATA_DIR, class_name, filename)\n",
    "        dst = os.path.join(DATA_DIR, split_name, class_name, filename)\n",
    "        shutil.copyfile(src, dst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "copy_split(train, \"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "copy_split(test, \"test\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}