Subir nuevo archivo

6274c024 · migtoqu · 53608935 · 6274c024
Commit 6274c024 authored 3 years ago by migtoqu
--- a/Código/Gestion_de_directorios.ipynb
+++ b/Código/Gestion_de_directorios.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Importamos módulos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os, shutil\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lectura de datos (metadatos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>FILE NAME</th>\n",
+       "      <th>CLASE</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>COVID-19(1)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>COVID-19(2)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COVID-19(3)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>COVID-19(4)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>COVID-19(5)</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     FILE NAME     CLASE\n",
+       "0  COVID-19(1)  COVID-19\n",
+       "1  COVID-19(2)  COVID-19\n",
+       "2  COVID-19(3)  COVID-19\n",
+       "3  COVID-19(4)  COVID-19\n",
+       "4  COVID-19(5)  COVID-19"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "metadata = pd.read_csv('../datos/Metadata/metadatos.csv')\n",
+    "metadata.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Separamos en entrenamiento y prueba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train, test = train_test_split(metadata, test_size=1/3, stratify=metadata.CLASE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Observamos las mismas proporciones en todos los subconjuntos"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Creamos en el directorio datos los directorios train y test donde almacenaremos las imágenes correspondientes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Intentamos obtener los nombres reales de las imágenes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/COVID-19\"\n",
+    "covid_files = [f for f in os.listdir(mypath)]\n",
+    "#covid_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/NORMAL\"\n",
+    "normal_files = [f for f in os.listdir(mypath)]\n",
+    "#normal_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mypath = \"../datos/Viral Pneumonia\"\n",
+    "pneumonia_files = [f for f in os.listdir(mypath)]\n",
+    "#pneumonia_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files= normal_files+pneumonia_files+covid_files\n",
+    "#files.append(pneumonia_files)\n",
+    "#files.append(covid_files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "clase_n = ['NORMAL' for i in range(0,len(normal_files))]\n",
+    "clase_p = ['Viral Pneumonia' for i in range(0,len(pneumonia_files))]\n",
+    "clase_c = ['COVID-19' for i in range(0,len(covid_files))]\n",
+    "\n",
+    "clase = clase_n+clase_p+clase_c"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>FILENAME</th>\n",
+       "      <th>CLASS</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>NORMAL (1).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NORMAL (10).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>NORMAL (100).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>NORMAL (101).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>NORMAL (102).png</td>\n",
+       "      <td>NORMAL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2900</th>\n",
+       "      <td>COVID-19(215).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2901</th>\n",
+       "      <td>COVID-19(216).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2902</th>\n",
+       "      <td>COVID-19(217).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2903</th>\n",
+       "      <td>COVID-19(218).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2904</th>\n",
+       "      <td>COVID-19(219).png</td>\n",
+       "      <td>COVID-19</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2905 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               FILENAME     CLASS\n",
+       "0        NORMAL (1).png    NORMAL\n",
+       "1       NORMAL (10).png    NORMAL\n",
+       "2      NORMAL (100).png    NORMAL\n",
+       "3      NORMAL (101).png    NORMAL\n",
+       "4      NORMAL (102).png    NORMAL\n",
+       "...                 ...       ...\n",
+       "2900  COVID-19(215).png  COVID-19\n",
+       "2901  COVID-19(216).png  COVID-19\n",
+       "2902  COVID-19(217).png  COVID-19\n",
+       "2903  COVID-19(218).png  COVID-19\n",
+       "2904  COVID-19(219).png  COVID-19\n",
+       "\n",
+       "[2905 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.DataFrame(list(zip(files, clase)), \n",
+    "               columns =['FILENAME', 'CLASS']) \n",
+    "df "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Separación en entrenamiento y prueba"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train, test = train_test_split(df, test_size=1/3, stratify=df.CLASS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "METADATA:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    1345\n",
+      "NORMAL             1341\n",
+      "COVID-19            219\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.462995\n",
+      "NORMAL             0.461618\n",
+      "COVID-19           0.075387\n",
+      "Name: CLASS, dtype: float64\n",
+      "----------------------------------------------\n",
+      "TRAIN:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    896\n",
+      "NORMAL             894\n",
+      "COVID-19           146\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.462810\n",
+      "NORMAL             0.461777\n",
+      "COVID-19           0.075413\n",
+      "Name: CLASS, dtype: float64\n",
+      "----------------------------------------------\n",
+      "TEST:\n",
+      "Total frecuencies: \n",
+      " Viral Pneumonia    449\n",
+      "NORMAL             447\n",
+      "COVID-19            73\n",
+      "Name: CLASS, dtype: int64\n",
+      "Proportion: \n",
+      " Viral Pneumonia    0.463364\n",
+      "NORMAL             0.461300\n",
+      "COVID-19           0.075335\n",
+      "Name: CLASS, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"METADATA:\")\n",
+    "print(\"Total frecuencies: \\n\",df.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",df.CLASS.value_counts()/df.shape[0])\n",
+    "print(\"----------------------------------------------\")\n",
+    "print(\"TRAIN:\")\n",
+    "print(\"Total frecuencies: \\n\",train.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",train.CLASS.value_counts()/train.shape[0])\n",
+    "print(\"----------------------------------------------\")\n",
+    "print(\"TEST:\")\n",
+    "print(\"Total frecuencies: \\n\",test.CLASS.value_counts())\n",
+    "print(\"Proportion: \\n\",test.CLASS.value_counts()/test.shape[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Comprobamos que las proporciones se mantienen: hemos realizado un muestreo estratificado."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(train.shape[0]):\n",
+    "    #train.iloc[i,1] = CLASE\n",
+    "    #train.iloc[i,0] = FILE NAME\n",
+    "    src = os.path.join(\"../datos\",str(train.iloc[i,1]),str(train.iloc[i,0]))\n",
+    "    dst = os.path.join(\"../datos/train\",str(train.iloc[i,1]),str(train.iloc[i,0]))\n",
+    "    shutil.copyfile(src,dst)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(test.shape[0]):\n",
+    "    #train.iloc[i,1] = CLASE\n",
+    "    #train.iloc[i,0] = FILE NAME\n",
+    "    src = os.path.join(\"../datos\",str(test.iloc[i,1]),str(test.iloc[i,0]))\n",
+    "    dst = os.path.join(\"../datos/test\",str(test.iloc[i,1]),str(test.iloc[i,0]))\n",
+    "    shutil.copyfile(src,dst)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:markdown id: tags:
+
+Importamos módulos
+
+%% Cell type:code id: tags:
+
+``` python
+import os, shutil
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+```
+
+%% Cell type:markdown id: tags:
+
+Lectura de datos (metadatos)
+
+%% Cell type:code id: tags:
+
+``` python
+metadata = pd.read_csv('../datos/Metadata/metadatos.csv')
+metadata.head()
+```
+
+%% Output
+
+         FILE NAME     CLASE
+    0  COVID-19(1)  COVID-19
+    1  COVID-19(2)  COVID-19
+    2  COVID-19(3)  COVID-19
+    3  COVID-19(4)  COVID-19
+    4  COVID-19(5)  COVID-19
+
+%% Cell type:markdown id: tags:
+
+Separamos en entrenamiento y prueba
+
+%% Cell type:code id: tags:
+
+``` python
+train, test = train_test_split(metadata, test_size=1/3, stratify=metadata.CLASE)
+```
+
+%% Cell type:markdown id: tags:
+
+Observamos las mismas proporciones en todos los subconjuntos
+
+%% Cell type:markdown id: tags:
+
+Creamos en el directorio datos los directorios train y test donde almacenaremos las imágenes correspondientes
+
+%% Cell type:markdown id: tags:
+
+Intentamos obtener los nombres reales de las imágenes
+
+%% Cell type:code id: tags:
+
+``` python
+mypath = "../datos/COVID-19"
+covid_files = [f for f in os.listdir(mypath)]
+#covid_files
+```
+
+%% Cell type:code id: tags:
+
+``` python
+mypath = "../datos/NORMAL"
+normal_files = [f for f in os.listdir(mypath)]
+#normal_files
+```
+
+%% Cell type:code id: tags:
+
+``` python
+mypath = "../datos/Viral Pneumonia"
+pneumonia_files = [f for f in os.listdir(mypath)]
+#pneumonia_files
+```
+
+%% Cell type:code id: tags:
+
+``` python
+files= normal_files+pneumonia_files+covid_files
+#files.append(pneumonia_files)
+#files.append(covid_files)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+clase_n = ['NORMAL' for i in range(0,len(normal_files))]
+clase_p = ['Viral Pneumonia' for i in range(0,len(pneumonia_files))]
+clase_c = ['COVID-19' for i in range(0,len(covid_files))]
+
+clase = clase_n+clase_p+clase_c
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df = pd.DataFrame(list(zip(files, clase)),
+               columns =['FILENAME', 'CLASS'])
+df
+```
+
+%% Output
+
+                   FILENAME     CLASS
+    0        NORMAL (1).png    NORMAL
+    1       NORMAL (10).png    NORMAL
+    2      NORMAL (100).png    NORMAL
+    3      NORMAL (101).png    NORMAL
+    4      NORMAL (102).png    NORMAL
+    ...                 ...       ...
+    2900  COVID-19(215).png  COVID-19
+    2901  COVID-19(216).png  COVID-19
+    2902  COVID-19(217).png  COVID-19
+    2903  COVID-19(218).png  COVID-19
+    2904  COVID-19(219).png  COVID-19
+    
+    [2905 rows x 2 columns]
+
+%% Cell type:markdown id: tags:
+
+Separación en entrenamiento y prueba
+
+%% Cell type:code id: tags:
+
+``` python
+train, test = train_test_split(df, test_size=1/3, stratify=df.CLASS)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print("METADATA:")
+print("Total frecuencies: \n",df.CLASS.value_counts())
+print("Proportion: \n",df.CLASS.value_counts()/df.shape[0])
+print("----------------------------------------------")
+print("TRAIN:")
+print("Total frecuencies: \n",train.CLASS.value_counts())
+print("Proportion: \n",train.CLASS.value_counts()/train.shape[0])
+print("----------------------------------------------")
+print("TEST:")
+print("Total frecuencies: \n",test.CLASS.value_counts())
+print("Proportion: \n",test.CLASS.value_counts()/test.shape[0])
+```
+
+%% Output
+
+    METADATA:
+    Total frecuencies:
+     Viral Pneumonia    1345
+    NORMAL             1341
+    COVID-19            219
+    Name: CLASS, dtype: int64
+    Proportion:
+     Viral Pneumonia    0.462995
+    NORMAL             0.461618
+    COVID-19           0.075387
+    Name: CLASS, dtype: float64
+    ----------------------------------------------
+    TRAIN:
+    Total frecuencies:
+     Viral Pneumonia    896
+    NORMAL             894
+    COVID-19           146
+    Name: CLASS, dtype: int64
+    Proportion:
+     Viral Pneumonia    0.462810
+    NORMAL             0.461777
+    COVID-19           0.075413
+    Name: CLASS, dtype: float64
+    ----------------------------------------------
+    TEST:
+    Total frecuencies:
+     Viral Pneumonia    449
+    NORMAL             447
+    COVID-19            73
+    Name: CLASS, dtype: int64
+    Proportion:
+     Viral Pneumonia    0.463364
+    NORMAL             0.461300
+    COVID-19           0.075335
+    Name: CLASS, dtype: float64
+
+%% Cell type:markdown id: tags:
+
+Comprobamos que las proporciones se mantienen: hemos realizado un muestreo estratificado.
+
+%% Cell type:code id: tags:
+
+``` python
+for i in range(train.shape[0]):
+    #train.iloc[i,1] = CLASE
+    #train.iloc[i,0] = FILE NAME
+    src = os.path.join("../datos",str(train.iloc[i,1]),str(train.iloc[i,0]))
+    dst = os.path.join("../datos/train",str(train.iloc[i,1]),str(train.iloc[i,0]))
+    shutil.copyfile(src,dst)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+for i in range(test.shape[0]):
+    #train.iloc[i,1] = CLASE
+    #train.iloc[i,0] = FILE NAME
+    src = os.path.join("../datos",str(test.iloc[i,1]),str(test.iloc[i,0]))
+    dst = os.path.join("../datos/test",str(test.iloc[i,1]),str(test.iloc[i,0]))
+    shutil.copyfile(src,dst)
+```