From 30a707884a5c35103357c27683563301baee8228 Mon Sep 17 00:00:00 2001
From: migtoqu <miguel.toquero@alumnos.uva.es>
Date: Thu, 15 Jul 2021 08:11:27 +0000
Subject: [PATCH] Subir nuevo archivo

---
 .../Codigo/Gestion_de_directorios.ipynb       | 455 ++++++++++++++++++
 1 file changed, 455 insertions(+)
 create mode 100644 Aplicacion/Codigo/Gestion_de_directorios.ipynb

diff --git a/Aplicacion/Codigo/Gestion_de_directorios.ipynb b/Aplicacion/Codigo/Gestion_de_directorios.ipynb
new file mode 100644
index 0000000..530722d
--- /dev/null
+++ b/Aplicacion/Codigo/Gestion_de_directorios.ipynb
@@ -0,0 +1,455 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Importamos módulos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, shutil\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lectura de datos (metadatos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>FILE NAME</th>\n",
+       "      <th>CLASE</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>COVID-19(1)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>COVID-19(2)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COVID-19(3)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>COVID-19(4)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>COVID-19(5)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     FILE NAME     CLASE\n",
+       "0  COVID-19(1)  COVID-19\n",
+       "1  COVID-19(2)  COVID-19\n",
+       "2  COVID-19(3)  COVID-19\n",
+       "3  COVID-19(4)  COVID-19\n",
+       "4  COVID-19(5)  COVID-19"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "metadata = pd.read_csv('../datos/Metadata/metadatos.csv')\n",
+    "metadata.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Separamos en entrenamiento y prueba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train, test = train_test_split(metadata, test_size=1/3, stratify=metadata.CLASE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Observamos las mismas proporciones en todos los subconjuntos"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Creamos en el directorio datos los directorios train y test donde almacenaremos las imágenes correspondientes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Intentamos obtener los nombres reales de las imágenes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/COVID-19\"\n",
+    "covid_files = [f for f in os.listdir(mypath)]\n",
+    "#covid_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/NORMAL\"\n",
+    "normal_files = [f for f in os.listdir(mypath)]\n",
+    "#normal_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/Viral Pneumonia\"\n",
+    "pneumonia_files = [f for f in os.listdir(mypath)]\n",
+    "#pneumonia_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files= normal_files+pneumonia_files+covid_files\n",
+    "#files.append(pneumonia_files)\n",
+    "#files.append(covid_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clase_n = ['NORMAL' for i in range(0,len(normal_files))]\n",
+    "clase_p = ['Viral Pneumonia' for i in range(0,len(pneumonia_files))]\n",
+    "clase_c = ['COVID-19' for i in range(0,len(covid_files))]\n",
+    "\n",
+    "clase = clase_n+clase_p+clase_c"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>FILENAME</th>\n",
+       "      <th>CLASS</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>NORMAL (1).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NORMAL (10).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NORMAL (100).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>NORMAL (101).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>NORMAL (102).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2900</th>\n",
+       "      <td>COVID-19(215).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2901</th>\n",
+       "      <td>COVID-19(216).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2902</th>\n",
+       "      <td>COVID-19(217).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2903</th>\n",
+       "      <td>COVID-19(218).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2904</th>\n",
+       "      <td>COVID-19(219).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2905 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               FILENAME     CLASS\n",
+       "0        NORMAL (1).png    NORMAL\n",
+       "1       NORMAL (10).png    NORMAL\n",
+       "2      NORMAL (100).png    NORMAL\n",
+       "3      NORMAL (101).png    NORMAL\n",
+       "4      NORMAL (102).png    NORMAL\n",
+       "...                 ...       ...\n",
+       "2900  COVID-19(215).png  COVID-19\n",
+       "2901  COVID-19(216).png  COVID-19\n",
+       "2902  COVID-19(217).png  COVID-19\n",
+       "2903  COVID-19(218).png  COVID-19\n",
+       "2904  COVID-19(219).png  COVID-19\n",
+       "\n",
+       "[2905 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.DataFrame(list(zip(files, clase)), \n",
+    "               columns =['FILENAME', 'CLASS']) \n",
+    "df "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Separación en entrenamiento y prueba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train, test = train_test_split(df, test_size=1/3, stratify=df.CLASS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "METADATA:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    1345\n",
+      "NORMAL             1341\n",
+      "COVID-19            219\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.462995\n",
+      "NORMAL             0.461618\n",
+      "COVID-19           0.075387\n",
+      "Name: CLASS, dtype: float64\n",
+      "----------------------------------------------\n",
+      "TRAIN:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    896\n",
+      "NORMAL             894\n",
+      "COVID-19           146\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.462810\n",
+      "NORMAL             0.461777\n",
+      "COVID-19           0.075413\n",
+      "Name: CLASS, dtype: float64\n",
+      "----------------------------------------------\n",
+      "TEST:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    449\n",
+      "NORMAL             447\n",
+      "COVID-19            73\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.463364\n",
+      "NORMAL             0.461300\n",
+      "COVID-19           0.075335\n",
+      "Name: CLASS, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"METADATA:\")\n",
+    "print(\"Total frecuencies: \\n\",df.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",df.CLASS.value_counts()/df.shape[0])\n",
+    "print(\"----------------------------------------------\")\n",
+    "print(\"TRAIN:\")\n",
+    "print(\"Total frecuencies: \\n\",train.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",train.CLASS.value_counts()/train.shape[0])\n",
+    "print(\"----------------------------------------------\")\n",
+    "print(\"TEST:\")\n",
+    "print(\"Total frecuencies: \\n\",test.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",test.CLASS.value_counts()/test.shape[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Comprobamos que las proporciones de cada clase se mantienen en entrenamiento y prueba: hemos realizado un muestreo estratificado."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(train.shape[0]):\n",
+    "    #train.iloc[i,1] = CLASS\n",
+    "    #train.iloc[i,0] = FILENAME\n",
+    "    src = os.path.join(\"../datos\",str(train.iloc[i,1]),str(train.iloc[i,0]))\n",
+    "    dst = os.path.join(\"../datos/train\",str(train.iloc[i,1]),str(train.iloc[i,0]))\n",
+    "    shutil.copyfile(src,dst)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(test.shape[0]):\n",
+    "    #test.iloc[i,1] = CLASS\n",
+    "    #test.iloc[i,0] = FILENAME\n",
+    "    src = os.path.join(\"../datos\",str(test.iloc[i,1]),str(test.iloc[i,0]))\n",
+    "    dst = os.path.join(\"../datos/test\",str(test.iloc[i,1]),str(test.iloc[i,0]))\n",
+    "    shutil.copyfile(src,dst)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
-- 
GitLab