{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Importamos módulos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, shutil\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lectura de datos (metadatos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FILE NAME</th>\n",
       "      <th>CLASE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>COVID-19(1)</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>COVID-19(2)</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>COVID-19(3)</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>COVID-19(4)</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>COVID-19(5)</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     FILE NAME     CLASE\n",
       "0  COVID-19(1)  COVID-19\n",
       "1  COVID-19(2)  COVID-19\n",
       "2  COVID-19(3)  COVID-19\n",
       "3  COVID-19(4)  COVID-19\n",
       "4  COVID-19(5)  COVID-19"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the metadata CSV and preview its first five rows.\n",
    "metadata = pd.read_csv('../datos/Metadata/metadatos.csv')\n",
    "metadata.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Separamos en entrenamiento y prueba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preliminary stratified split of the metadata table (2/3 train, 1/3 test).\n",
    "# random_state pins the shuffle so a fresh kernel reproduces the same partition.\n",
    "# NOTE: `train` and `test` are reassigned further down by the split built from\n",
    "# the directory listing.\n",
    "train, test = train_test_split(\n",
    "    metadata, test_size=1/3, stratify=metadata['CLASE'], random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Observamos las mismas proporciones en todos los subconjuntos"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Creamos en el directorio `datos` los directorios `train` y `test`, donde almacenaremos las imágenes correspondientes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Intentamos obtener los nombres reales de las imágenes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the entries of the COVID-19 directory.\n",
    "# os.listdir returns names in arbitrary order, so sort them to keep the row\n",
    "# order of the table built from this listing stable between runs.\n",
    "mypath = \"../datos/COVID-19\"\n",
    "covid_files = sorted(os.listdir(mypath))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the entries of the NORMAL directory.\n",
    "# os.listdir returns names in arbitrary order, so sort them to keep the row\n",
    "# order of the table built from this listing stable between runs.\n",
    "mypath = \"../datos/NORMAL\"\n",
    "normal_files = sorted(os.listdir(mypath))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the entries of the Viral Pneumonia directory.\n",
    "# os.listdir returns names in arbitrary order, so sort them to keep the row\n",
    "# order of the table built from this listing stable between runs.\n",
    "mypath = \"../datos/Viral Pneumonia\"\n",
    "pneumonia_files = sorted(os.listdir(mypath))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate the three listings in the order NORMAL, Viral Pneumonia, COVID-19.\n",
    "files = [*normal_files, *pneumonia_files, *covid_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One class label per listed file, built by list repetition.\n",
    "clase_n = ['NORMAL'] * len(normal_files)\n",
    "clase_p = ['Viral Pneumonia'] * len(pneumonia_files)\n",
    "clase_c = ['COVID-19'] * len(covid_files)\n",
    "\n",
    "# Labels are concatenated in the order NORMAL, Viral Pneumonia, COVID-19.\n",
    "clase = clase_n + clase_p + clase_c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>FILENAME</th>\n",
       "      <th>CLASS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NORMAL (1).png</td>\n",
       "      <td>NORMAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NORMAL (10).png</td>\n",
       "      <td>NORMAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NORMAL (100).png</td>\n",
       "      <td>NORMAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NORMAL (101).png</td>\n",
       "      <td>NORMAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NORMAL (102).png</td>\n",
       "      <td>NORMAL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2900</th>\n",
       "      <td>COVID-19(215).png</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2901</th>\n",
       "      <td>COVID-19(216).png</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2902</th>\n",
       "      <td>COVID-19(217).png</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2903</th>\n",
       "      <td>COVID-19(218).png</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2904</th>\n",
       "      <td>COVID-19(219).png</td>\n",
       "      <td>COVID-19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2905 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               FILENAME     CLASS\n",
       "0        NORMAL (1).png    NORMAL\n",
       "1       NORMAL (10).png    NORMAL\n",
       "2      NORMAL (100).png    NORMAL\n",
       "3      NORMAL (101).png    NORMAL\n",
       "4      NORMAL (102).png    NORMAL\n",
       "...                 ...       ...\n",
       "2900  COVID-19(215).png  COVID-19\n",
       "2901  COVID-19(216).png  COVID-19\n",
       "2902  COVID-19(217).png  COVID-19\n",
       "2903  COVID-19(218).png  COVID-19\n",
       "2904  COVID-19(219).png  COVID-19\n",
       "\n",
       "[2905 rows x 2 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair each file name with its class label, one row per file.\n",
    "df = pd.DataFrame(zip(files, clase), columns=['FILENAME', 'CLASS'])\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Separación en entrenamiento y prueba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratified split of the file table (2/3 train, 1/3 test), keyed on CLASS.\n",
    "# random_state pins the shuffle so a fresh kernel reproduces the same partition.\n",
    "train, test = train_test_split(\n",
    "    df, test_size=1/3, stratify=df['CLASS'], random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "METADATA:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia    1345\n",
      "NORMAL             1341\n",
      "COVID-19            219\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia    0.462995\n",
      "NORMAL             0.461618\n",
      "COVID-19           0.075387\n",
      "Name: CLASS, dtype: float64\n",
      "----------------------------------------------\n",
      "TRAIN:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia    896\n",
      "NORMAL             894\n",
      "COVID-19           146\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia    0.462810\n",
      "NORMAL             0.461777\n",
      "COVID-19           0.075413\n",
      "Name: CLASS, dtype: float64\n",
      "----------------------------------------------\n",
      "TEST:\n",
      "Total frecuencies: \n",
      " Viral Pneumonia    449\n",
      "NORMAL             447\n",
      "COVID-19            73\n",
      "Name: CLASS, dtype: int64\n",
      "Proportion: \n",
      " Viral Pneumonia    0.463364\n",
      "NORMAL             0.461300\n",
      "COVID-19           0.075335\n",
      "Name: CLASS, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Print absolute and relative class frequencies for the full table and each split.\n",
    "subsets = [(\"METADATA:\", df), (\"TRAIN:\", train), (\"TEST:\", test)]\n",
    "for position, (title, frame) in enumerate(subsets):\n",
    "    # A separator line goes between sections, not before the first one.\n",
    "    if position > 0:\n",
    "        print(\"----------------------------------------------\")\n",
    "    class_counts = frame.CLASS.value_counts()\n",
    "    print(title)\n",
    "    print(\"Total frecuencies: \\n\", class_counts)\n",
    "    print(\"Proportion: \\n\", class_counts / frame.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Comprobamos que las proporciones se mantienen: hemos realizado un muestreo estratificado."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy every file of the training split from ../datos/<class>/ to\n",
    "# ../datos/train/<class>/, keeping the original file name.\n",
    "for filename, label in zip(train['FILENAME'], train['CLASS']):\n",
    "    src = os.path.join(\"../datos\", str(label), str(filename))\n",
    "    dst_dir = os.path.join(\"../datos/train\", str(label))\n",
    "    # shutil.copyfile does not create missing parent directories, so make sure\n",
    "    # the per-class destination exists (no-op when it is already there).\n",
    "    os.makedirs(dst_dir, exist_ok=True)\n",
    "    shutil.copyfile(src, os.path.join(dst_dir, str(filename)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy every file of the test split from ../datos/<class>/ to\n",
    "# ../datos/test/<class>/, keeping the original file name.\n",
    "for filename, label in zip(test['FILENAME'], test['CLASS']):\n",
    "    src = os.path.join(\"../datos\", str(label), str(filename))\n",
    "    dst_dir = os.path.join(\"../datos/test\", str(label))\n",
    "    # shutil.copyfile does not create missing parent directories, so make sure\n",
    "    # the per-class destination exists (no-op when it is already there).\n",
    "    os.makedirs(dst_dir, exist_ok=True)\n",
    "    shutil.copyfile(src, os.path.join(dst_dir, str(filename)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}