Skip to content
Snippets Groups Projects
Commit c583fca6 authored by pabvald's avatar pabvald
Browse files

SICK dataset added

parent 2c8043df
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
from load import load_file
import os
import io
```
%% Cell type:code id: tags:
``` python
def load_SICK(path, verbose=False):
    """ Loads the SICK-Relatedness files found in `path` and returns the train split.

    The train, trial (dev) and annotated test files are all read with
    `load_file_SICK`, in that order.

    Args:
        path: directory containing 'SICK_train.txt', 'SICK_trial.txt' and
            'SICK_test_annotated.txt'.
        verbose: when true, prints a task banner before loading.

    Returns:
        The value `load_file_SICK` produced for 'SICK_train.txt'.
    """
    if verbose:
        print('***** Task: SICK-Relatedness*****\n\n')
    file_names = ('SICK_train.txt', 'SICK_trial.txt', 'SICK_test_annotated.txt')
    splits = [load_file_SICK(os.path.join(path, name)) for name in file_names]
    # NOTE(review): the dev and test splits are read but then discarded; only
    # the train split is handed back to the caller -- confirm this is intended.
    return splits[0]
def load_file_SICK(path):
    """ Loads a tab-separated SICK file and tokenizes its sentences on whitespace.

    The first line of the file is treated as a header and skipped. For every
    other non-blank line, columns 1 and 2 are taken as the two sentences and
    column 3 as the similarity score.

    Args:
        path: path of the SICK file to read (decoded as UTF-8).

    Returns:
        A dict with keys 'sent_1' and 'sent_2' (lists of token lists) and
        'sim' (list of floats), aligned by position.

    Raises:
        IndexError: if a non-blank data line has fewer than four columns.
        ValueError: if a score cannot be converted to float.
    """
    sick_data = {'sent_1': [], 'sent_2': [], 'sim': []}
    with io.open(path, 'r', encoding='utf-8') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no data and would otherwise fail on the column lookup.
                continue
            fields = stripped.split('\t')
            sick_data['sent_1'].append(fields[1].split())
            sick_data['sent_2'].append(fields[2].split())
            sick_data['sim'].append(float(fields[3]))
    return sick_data
```
%% Cell type:code id: tags:
``` python
# Load the SICK data from the local dataset directory; `a` holds whatever
# load_SICK returns for that path.
a = load_SICK('./data/datasets/SICK/')
```
%% Cell type:code id: tags:
``` python
```
This diff is collapsed.
......@@ -14,6 +14,7 @@
import io
import os
import csv
import numpy as np
from utils import preprocess
......@@ -29,8 +30,8 @@ def load_frequencies(path):
return frequencies
def load_file(path, datasets, preprocessing, verbose=False):
""" Loads and STS file and pre-processes its sentences """
def load_file_STS(path, datasets, preprocessing, verbose=False):
""" Loads a STS test file and preprocesses its sentences """
data = {}
for dataset in datasets:
......@@ -65,43 +66,81 @@ def load_file(path, datasets, preprocessing, verbose=False):
return data
def load_SICK(path, preprocessing, verbose=False):
    """ Loads the SICK train, dev and test files and preprocesses their sentences.

    Args:
        path: directory containing 'SICK_train.txt', 'SICK_trial.txt' and
            'SICK_test_annotated.txt'.
        preprocessing: passed unchanged to `load_file_SICK` for every file.
        verbose: when true, prints a task banner before loading.

    Returns:
        A dict with keys 'train', 'test' and 'dev', each holding the value
        `load_file_SICK` returned for the corresponding file.
    """
    if verbose:
        print('\n\n***** Task: SICK-Relatedness*****\n')

    def _load_split(file_name):
        # Every split lives in the same directory and shares the same options.
        return load_file_SICK(os.path.join(path, file_name), preprocessing)

    # Files are read in train, dev, test order.
    train_split = _load_split('SICK_train.txt')
    dev_split = _load_split('SICK_trial.txt')
    test_split = _load_split('SICK_test_annotated.txt')

    return {'train': train_split, 'test': test_split, 'dev': dev_split}
def load_file_SICK(path, preprocessing):
    """ Loads a tab-separated SICK file and preprocesses its sentences.

    The first line of the file is treated as a header and skipped. For every
    other non-blank line, columns 1 and 2 are taken as the two sentences and
    column 3 as the similarity score.

    Args:
        path: path of the SICK file to read (decoded as UTF-8).
        preprocessing: mapping of keyword arguments expanded into each
            `preprocess` call.

    Returns:
        A tuple `(sent1, sent2, sim)`: the outputs of `preprocess` for the
        first and second sentences, and the list of scores as floats.

    Raises:
        IndexError: if a non-blank data line has fewer than four columns.
        ValueError: if a score cannot be converted to float.
    """
    sent1 = []
    sent2 = []
    sim = []

    # Read file
    with io.open(path, 'r', encoding='utf-8') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no data and would otherwise fail on the column lookup.
                continue
            fields = stripped.split('\t')
            sent1.append(fields[1])
            sent2.append(fields[2])
            sim.append(float(fields[3]))

    sent1 = preprocess(sent1, **preprocessing)
    sent2 = preprocess(sent2, **preprocessing)
    return (sent1, sent2, sim)
def load_sts_12(path, preprocessing, verbose=False):
    """ Loads the SemEval-2012's Semantic Textual Similarity task.

    Args:
        path: base directory; the data is read from '<path>/STS12-en-test'.
        preprocessing: forwarded unchanged to `load_file_STS`.
        verbose: when true, prints a task banner and is forwarded to
            `load_file_STS`.

    Returns:
        Whatever `load_file_STS` returns for the STS12 datasets.
    """
    if verbose:
        print('\n\n***** TASK: STS12 *****\n')
    datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
                'surprise.OnWN', 'surprise.SMTnews']
    # Only the renamed loader is called; the stale `load_file` call and the
    # duplicated banner left over from the previous revision are dropped.
    return load_file_STS('{}/STS12-en-test'.format(path), datasets, preprocessing, verbose=verbose)
def load_sts_13(path, preprocessing, verbose=False):
    """ Loads the SemEval-2013's Semantic Textual Similarity task.

    Args:
        path: base directory; the data is read from '<path>/STS13-en-test'.
        preprocessing: forwarded unchanged to `load_file_STS`.
        verbose: when true, prints a task banner and is forwarded to
            `load_file_STS`.

    Returns:
        Whatever `load_file_STS` returns for the STS13 datasets.
    """
    # STS13 here does not contain the "SMT" subtask due to LICENSE issue
    if verbose:
        print('\n\n***** TASK: STS13 (-SMT) ***\n')
    datasets = ['FNWN', 'headlines', 'OnWN']
    # Only the renamed loader is called; the stale `load_file` call and the
    # duplicated banner left over from the previous revision are dropped.
    return load_file_STS('{}/STS13-en-test'.format(path), datasets, preprocessing, verbose=verbose)
def load_sts_14(path, preprocessing, verbose=False):
    """ Loads the SemEval-2014's Semantic Textual Similarity task.

    Args:
        path: base directory; the data is read from '<path>/STS14-en-test'.
        preprocessing: forwarded unchanged to `load_file_STS`.
        verbose: when true, prints a task banner and is forwarded to
            `load_file_STS`.

    Returns:
        Whatever `load_file_STS` returns for the STS14 datasets.
    """
    if verbose:
        print('\n\n***** TASK: STS14 *****\n')
    datasets = ['deft-forum', 'deft-news', 'headlines',
                'images', 'OnWN', 'tweet-news']
    # Only the renamed loader is called; the stale `load_file` call and the
    # duplicated banner left over from the previous revision are dropped.
    return load_file_STS('{}/STS14-en-test'.format(path), datasets, preprocessing, verbose=verbose)
def load_sts_15(path, preprocessing, verbose=False):
    """ Loads the SemEval-2015's Semantic Textual Similarity task.

    Args:
        path: base directory; the data is read from '<path>/STS15-en-test'.
        preprocessing: forwarded unchanged to `load_file_STS`.
        verbose: when true, prints a task banner and is forwarded to
            `load_file_STS`.

    Returns:
        Whatever `load_file_STS` returns for the STS15 datasets.
    """
    if verbose:
        print('\n\n***** TASK: STS15 *****\n')
    datasets = ['answers-forums', 'answers-students',
                'belief', 'headlines', 'images']
    # Only the renamed loader is called; the stale `load_file` call and the
    # duplicated banner left over from the previous revision are dropped.
    return load_file_STS('{}/STS15-en-test'.format(path), datasets, preprocessing, verbose=verbose)
def load_sts_16(path, preprocessing, verbose=False):
    """ Loads the SemEval-2016's Semantic Textual Similarity task.

    Args:
        path: base directory; the data is read from '<path>/STS16-en-test'.
        preprocessing: forwarded unchanged to `load_file_STS`.
        verbose: when true, prints a task banner and is forwarded to
            `load_file_STS`.

    Returns:
        Whatever `load_file_STS` returns for the STS16 datasets.
    """
    if verbose:
        print('\n\n***** TASK: STS16 *****\n')
    datasets = ['answer-answer', 'headlines', 'plagiarism',
                'postediting', 'question-question']
    # Only the renamed loader is called; the stale `load_file` call and the
    # duplicated banner left over from the previous revision are dropped.
    return load_file_STS('{}/STS16-en-test'.format(path), datasets, preprocessing, verbose=verbose)
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment