From 30a707884a5c35103357c27683563301baee8228 Mon Sep 17 00:00:00 2001 From: migtoqu <miguel.toquero@alumnos.uva.es> Date: Thu, 15 Jul 2021 08:11:27 +0000 Subject: [PATCH] Subir nuevo archivo --- .../Codigo/Gestion_de_directorios.ipynb | 455 ++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100644 Aplicacion/Codigo/Gestion_de_directorios.ipynb diff --git a/Aplicacion/Codigo/Gestion_de_directorios.ipynb b/Aplicacion/Codigo/Gestion_de_directorios.ipynb new file mode 100644 index 0000000..530722d --- /dev/null +++ b/Aplicacion/Codigo/Gestion_de_directorios.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Importamos modulos" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import os, shutil\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lectura de datos (metadatos)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>FILE NAME</th>\n", + " <th>CLASE</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>COVID-19(1)</td>\n", + " <td>COVID-19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>COVID-19(2)</td>\n", + " <td>COVID-19</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>COVID-19(3)</td>\n", + " <td>COVID-19</td>\n", + " </tr>\n", + " 
# Load the metadata table shipped with the dataset and preview its first rows
# (the bare expression only renders when run as a notebook cell).
metadata = pd.read_csv('../datos/Metadata/metadatos.csv')
metadata.head()

# Preliminary stratified split of the metadata table (one third for test).
# NOTE(review): `train` and `test` are reassigned further down by the split of
# `df`, so this result looks unused -- confirm before removing it.
train, test = train_test_split(metadata, test_size=1/3, stratify=metadata.CLASE)

# The 'FILE NAME' values in the preview above have no file extension, so the
# real image names are read from the per-class directories instead.
DATA_DIR = "../datos"


def _list_images(class_name):
    """Return the sorted names of the regular files in DATA_DIR/class_name.

    os.listdir() yields entries in arbitrary order and also reports
    sub-directories.  The result is therefore restricted to regular files
    (anything else cannot be copied with shutil.copyfile later on) and sorted
    so that the table built from it has the same row order on every run.

    Raises:
        FileNotFoundError: if the class directory does not exist.
    """
    class_dir = os.path.join(DATA_DIR, class_name)
    return sorted(
        entry
        for entry in os.listdir(class_dir)
        if os.path.isfile(os.path.join(class_dir, entry))
    )


covid_files = _list_images("COVID-19")
normal_files = _list_images("NORMAL")
pneumonia_files = _list_images("Viral Pneumonia")

# Concatenation order (NORMAL, Viral Pneumonia, COVID-19) must match the order
# in which the label lists are concatenated below.
files = normal_files + pneumonia_files + covid_files

# One label per file; each label is the name of the directory the file is in.
clase_n = ['NORMAL'] * len(normal_files)
clase_p = ['Viral Pneumonia'] * len(pneumonia_files)
clase_c = ['COVID-19'] * len(covid_files)

clase = clase_n + clase_p + clase_c
# Fixed seed for the split.  Without it every run draws a different partition,
# and because the images are later copied into train/ and test/ directories,
# a second run would add its files on top of the first one and the same image
# could end up in both directories.
RANDOM_STATE = 42

# One row per image: file name plus class label.  Building the table from a
# dict makes pandas raise ValueError when the two lists differ in length,
# whereas zip() would silently drop the surplus entries.
df = pd.DataFrame({'FILENAME': files, 'CLASS': clase})
df

# Split into training and test sets: one third of the rows go to test, and
# stratifying on CLASS keeps the class proportions equal in both subsets.
train, test = train_test_split(
    df, test_size=1/3, stratify=df.CLASS, random_state=RANDOM_STATE
)
def _print_class_summary(title, frame):
    """Print the absolute and relative frequency of each CLASS value in frame."""
    counts = frame.CLASS.value_counts()
    print(title)
    print("Total frecuencies: \n", counts)
    print("Proportion: \n", counts / frame.shape[0])


# Compare the class distribution of the full table with that of each subset.
# Matching proportions confirm that the split was stratified by class.
_SEPARATOR = "----------------------------------------------"

_print_class_summary("METADATA:", df)
print(_SEPARATOR)
_print_class_summary("TRAIN:", train)
print(_SEPARATOR)
_print_class_summary("TEST:", test)
def _copy_split(split_df, split_name, data_dir="../datos"):
    """Copy the images listed in split_df into data_dir/split_name/<class>/.

    Args:
        split_df: table whose first column holds the image file name and whose
            second column holds the class label; the label is also the name of
            the directory under data_dir that contains the image.
        split_name: name of the destination sub-directory ("train" or "test").
        data_dir: root directory holding the per-class source directories.

    Raises:
        FileNotFoundError: if a listed source image does not exist.
    """
    # Columns are addressed by position, as in the original loops.
    filenames = split_df.iloc[:, 0].astype(str)
    labels = split_df.iloc[:, 1].astype(str)

    # shutil.copyfile does not create missing directories, so make sure every
    # per-class destination exists before copying (no-op when already there).
    for label in labels.unique():
        os.makedirs(os.path.join(data_dir, split_name, label), exist_ok=True)

    for filename, label in zip(filenames, labels):
        src = os.path.join(data_dir, label, filename)
        dst = os.path.join(data_dir, split_name, label, filename)
        shutil.copyfile(src, dst)


# Materialise both subsets on disk as ../datos/train/<class>/ and
# ../datos/test/<class>/.
_copy_split(train, "train")
_copy_split(test, "test")