SICK dataset added

c583fca6 · pabvald · 2c8043df · 2c8043df · c583fca6 · c583fca6
Commit c583fca6 authored Feb 27, 2020 by pabvald
--- a/Untitled.ipynb
+++ b/Untitled.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from load import load_file\n",
-    "import os\n",
-    "import io"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def load_SICK(path, verbose=False):\n",
-    "    if verbose:\n",
-    "        print('***** Task: SICK-Relatedness*****\\n\\n')\n",
-    "    sick_train = load_file_SICK(os.path.join(path, 'SICK_train.txt'))\n",
-    "    sick_dev = load_file_SICK(os.path.join(path, 'SICK_trial.txt'))\n",
-    "    sick_test = load_file_SICK(os.path.join(path, 'SICK_test_annotated.txt'))\n",
-    "    \n",
-    "    return sick_train\n",
-    "\n",
-    "def load_file_SICK(path):\n",
-    "    skipFirstLine = True\n",
-    "    sick_data = {'sent_1': [], 'sent_2': [], 'sim': []}\n",
-    "    with io.open(path, 'r', encoding='utf-8') as f:\n",
-    "        for line in f:\n",
-    "            if skipFirstLine:\n",
-    "                skipFirstLine = False\n",
-    "            else:\n",
-    "                text = line.strip().split('\\t')\n",
-    "                sick_data['sent_1'].append(text[1].split())\n",
-    "                sick_data['sent_2'].append(text[2].split())\n",
-    "                sick_data['sim'].append(text[3])\n",
-    "    sick_data['sim'] = [float(s) for s in sick_data['sim']]\n",
-    "    return sick_data\n",
-    "\n",
-    "    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "a = load_SICK('./data/datasets/SICK/')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-%% Cell type:code id: tags:
-
-``` python
-from load import load_file
-import os
-import io
-```
-
-%% Cell type:code id: tags:
-
-``` python
-def load_SICK(path, verbose=False):
-    if verbose:
-        print('***** Task: SICK-Relatedness*****\n\n')
-    sick_train = load_file_SICK(os.path.join(path, 'SICK_train.txt'))
-    sick_dev = load_file_SICK(os.path.join(path, 'SICK_trial.txt'))
-    sick_test = load_file_SICK(os.path.join(path, 'SICK_test_annotated.txt'))
-
-    return sick_train
-
-def load_file_SICK(path):
-    skipFirstLine = True
-    sick_data = {'sent_1': [], 'sent_2': [], 'sim': []}
-    with io.open(path, 'r', encoding='utf-8') as f:
-        for line in f:
-            if skipFirstLine:
-                skipFirstLine = False
-            else:
-                text = line.strip().split('\t')
-                sick_data['sent_1'].append(text[1].split())
-                sick_data['sent_2'].append(text[2].split())
-                sick_data['sim'].append(text[3])
-    sick_data['sim'] = [float(s) for s in sick_data['sim']]
-    return sick_data
-
-
-```
-
-%% Cell type:code id: tags:
-
-``` python
-a = load_SICK('./data/datasets/SICK/')
-```
-
-%% Cell type:code id: tags:
-
-``` python
-```
--- a/evaluation.ipynb
+++ b/evaluation.ipynb
--- a/load.py
+++ b/load.py
@@ -14,6 +14,7 @@


 import io
+import os
 import csv
 import numpy as np
 from utils import preprocess
@@ -29,8 +30,8 @@ def load_frequencies(path):
        
    return frequencies

-def load_file(path, datasets, preprocessing, verbose=False):
-    """ Loads and STS file and pre-processes its sentences """
+def load_file_STS(path, datasets, preprocessing, verbose=False):
+    """ Loads a STS test file and preprocesses its sentences """
    data = {}

    for dataset in datasets:
@@ -65,43 +66,81 @@ def load_file(path, datasets, preprocessing, verbose=False):
        
    return data

+def load_SICK(path, preprocessing, verbose=False):
+    """ Loads the SICK train, dev and test files and preprocesses their sentences.
+
+    Returns a dict with keys 'train', 'test' and 'dev', each holding the
+    result of load_file_SICK for the matching file found under `path`.
+    """
+    if verbose:
+        print('\n\n***** Task: SICK-Relatedness*****\n')
+    # Files are read in this order: train, dev (the "trial" file), test
+    names = (('train', 'SICK_train.txt'), ('dev', 'SICK_trial.txt'),
+             ('test', 'SICK_test_annotated.txt'))
+    loaded = {split: load_file_SICK(os.path.join(path, name), preprocessing)
+              for split, name in names}
+    return {split: loaded[split] for split in ('train', 'test', 'dev')}
+
+def load_file_SICK(path, preprocessing):
+    """ Loads a SICK file and preprocesses its sentences.
+
+    The first line of the file is discarded (presumably a column header) and
+    blank lines are ignored. Every other line is split on tabs: fields 1 and
+    2 are kept as the two sentences and field 3 is converted to a float score.
+    `preprocessing` is forwarded to `preprocess` as keyword arguments.
+
+    Returns a tuple (sent1, sent2, sim).
+    """
+    sent1, sent2, sim = [], [], []
+    with io.open(path, 'r', encoding='utf-8') as f:
+        next(f, None)  # drop the first line; the default avoids StopIteration on an empty file
+        for line in f:
+            text = line.strip().split('\t')
+            if text == ['']:  # blank line: skip it instead of raising IndexError
+                continue
+            sent1.append(text[1])
+            sent2.append(text[2])
+            sim.append(float(text[3]))
+
+    return (preprocess(sent1, **preprocessing), preprocess(sent2, **preprocessing), sim)
+
 def load_sts_12(path, preprocessing, verbose=False):
    """ Loads the SemEval-2012's Semantic Textual Similarity task"""
    if verbose:
-        print('\n***** TASK: STS12 *****\n')
+        print('\n\n***** TASK: STS12 *****\n')
    datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
                        'surprise.OnWN', 'surprise.SMTnews']
-    return load_file('{}/STS12-en-test'.format(path), datasets, preprocessing, verbose=verbose)
+    return load_file_STS('{}/STS12-en-test'.format(path), datasets, preprocessing, verbose=verbose)

 def load_sts_13(path, preprocessing, verbose=False):
    """ Loads the SemEval-2013's Semantic Textual Similarity task"""
    # STS13 here does not contain the "SMT" subtask due to LICENSE issue
    if verbose:
-        print('\n***** TASK: STS13 (-SMT) ***\n\n')
+        print('\n\n***** TASK: STS13 (-SMT) ***\n')
    datasets = ['FNWN', 'headlines', 'OnWN']
-    return load_file('{}/STS13-en-test'.format(path), datasets, preprocessing, verbose=verbose)
+    return load_file_STS('{}/STS13-en-test'.format(path), datasets, preprocessing, verbose=verbose)

 def load_sts_14(path, preprocessing, verbose=False):
    """ Loads the SemEval-2014's Semantic Textual Similarity task"""
    if verbose:
-        print('\n***** TASK: STS14 *****\n')
+        print('\n\n***** TASK: STS14 *****\n')
    datasets = ['deft-forum', 'deft-news', 'headlines',
                        'images', 'OnWN', 'tweet-news']
-    return load_file('{}/STS14-en-test'.format(path), datasets, preprocessing, verbose=verbose)
+    return load_file_STS('{}/STS14-en-test'.format(path), datasets, preprocessing, verbose=verbose)

 def load_sts_15(path, preprocessing, verbose=False):
    """ Loads the SemEval-2015's Semantic Textual Similarity task"""
    if verbose:
-        print('\n***** TASK: STS15 *****\n')
+        print('\n\n***** TASK: STS15 *****\n')
    datasets = ['answers-forums', 'answers-students',
                        'belief', 'headlines', 'images']
-    return load_file('{}/STS15-en-test'.format(path), datasets, preprocessing, verbose=verbose)
+    return load_file_STS('{}/STS15-en-test'.format(path), datasets, preprocessing, verbose=verbose)

 def load_sts_16(path, preprocessing, verbose=False):
    """ Loads the SemEval-2016's Semantic Textual Similarity task"""
    if verbose:
-        print('\n***** TASK: STS16 *****\n')
+        print('\n\n***** TASK: STS16 *****\n')
    datasets = ['answer-answer', 'headlines', 'plagiarism',
                        'postediting', 'question-question']
-    return load_file('{}/STS16-en-test'.format(path), datasets, preprocessing, verbose=verbose)
+    return load_file_STS('{}/STS16-en-test'.format(path), datasets, preprocessing, verbose=verbose)

--- a/results.ods
+++ b/results.ods