Commit bcf55cd2 authored by pabvald's avatar pabvald

AVG and WMD

parent 008b193e
evaluations/word_embeddings
data/embedding/*
evaluations/__pycache__
data/datasets/SICK
senteval/__pycache__
data/datasets/STS
.venv
.*
.vscode
__pycache__
\ No newline at end of file
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
#
# Download and tokenize data with MOSES tokenizer
#
data_path=.
preprocess_exec=./tokenizer.sed
# Get MOSES
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
SCRIPTS=mosesdecoder/scripts
MTOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LOWER=$SCRIPTS/tokenizer/lowercase.perl
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit 1
fi
PTBTOKENIZER="sed -f tokenizer.sed"
mkdir -p $data_path
TREC='http://cogcomp.cs.illinois.edu/Data/QA/QC'
SICK='http://alt.qcri.org/semeval2014/task1/data/uploads'
BINCLASSIF='https://dl.fbaipublicfiles.com/senteval/senteval_data/datasmall_NB_ACL12.zip'
SSTbin='https://raw.githubusercontent.com/PrincetonML/SIF/master/data'
SSTfine='https://raw.githubusercontent.com/AcademiaSinicaNLPLab/sentiment_dataset/master/data/'
STSBenchmark='http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz'
SNLI='https://nlp.stanford.edu/projects/snli/snli_1.0.zip'
MULTINLI='https://www.nyu.edu/projects/bowman/multinli/multinli_0.9.zip'
COCO='https://dl.fbaipublicfiles.com/senteval/coco_r101_feat'
# MRPC is a special case: the original corpus is distributed as an .msi installer ($MRPC below); this script instead downloads pre-extracted train/test files (see the MRPC section at the end)
MRPC='https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi'
# STS 2012, 2013, 2014, 2015, 2016
declare -A STS_tasks
declare -A STS_paths
declare -A STS_subdirs
STS_tasks=(["STS12"]="MSRpar MSRvid SMTeuroparl surprise.OnWN surprise.SMTnews" ["STS13"]="FNWN headlines OnWN" ["STS14"]="deft-forum deft-news headlines OnWN images tweet-news" ["STS15"]="answers-forums answers-students belief headlines images" ["STS16"]="answer-answer headlines plagiarism postediting question-question")
STS_paths=(["STS12"]="http://ixa2.si.ehu.es/stswiki/images/4/40/STS2012-en-test.zip" ["STS13"]="http://ixa2.si.ehu.es/stswiki/images/2/2f/STS2013-en-test.zip" ["STS14"]="http://ixa2.si.ehu.es/stswiki/images/8/8c/STS2014-en-test.zip" ["STS15"]="http://ixa2.si.ehu.es/stswiki/images/d/da/STS2015-en-test.zip"
["STS16"]="http://ixa2.si.ehu.es/stswiki/images/9/98/STS2016-en-test.zip")
STS_subdirs=(["STS12"]="test-gold" ["STS13"]="test-gs" ["STS14"]="sts-en-test-gs-2014" ["STS15"]="test_evaluation_task2a" ["STS16"]="sts2016-english-with-gs-v1.0")
### Get Stanford Sentiment Treebank (SST) binary classification task
# SST binary
mkdir -p $data_path/SST/binary
for split in train dev test
do
curl -Lo $data_path/SST/binary/sentiment-$split $SSTbin/sentiment-$split
done
# SST fine-grained
mkdir -p $data_path/SST/fine/
for split in train dev test
do
curl -Lo $data_path/SST/fine/sentiment-$split $SSTfine/stsa.fine.$split
done
### STS datasets
# STS12, STS13, STS14, STS15, STS16
mkdir $data_path/STS
for task in "${!STS_tasks[@]}";
do
fpath=${STS_paths[$task]}
echo $fpath
curl -Lo $data_path/STS/data_$task.zip $fpath
unzip $data_path/STS/data_$task.zip -d $data_path/STS
mv $data_path/STS/${STS_subdirs[$task]} $data_path/STS/$task-en-test
rm $data_path/STS/data_$task.zip
for sts_task in ${STS_tasks[$task]}
do
fname=STS.input.$sts_task.txt
task_path=$data_path/STS/$task-en-test/
if [ "$task" = "STS16" ] ; then
echo 'Handling STS2016'
mv $task_path/STS2016.input.$sts_task.txt $task_path/$fname
mv $task_path/STS2016.gs.$sts_task.txt $task_path/STS.gs.$sts_task.txt
fi
cut -f1 $task_path/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $task_path/tmp1
cut -f2 $task_path/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $task_path/tmp2
paste $task_path/tmp1 $task_path/tmp2 > $task_path/$fname
rm $task_path/tmp1 $task_path/tmp2
done
done
# STSBenchmark (http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark)
curl -Lo $data_path/Stsbenchmark.tar.gz $STSBenchmark
tar -zxvf $data_path/Stsbenchmark.tar.gz -C $data_path
rm $data_path/Stsbenchmark.tar.gz
mv $data_path/stsbenchmark $data_path/STS/STSBenchmark
for split in train dev test
do
fname=sts-$split.csv
fdir=$data_path/STS/STSBenchmark
cut -f1,2,3,4,5 $fdir/$fname > $fdir/tmp1
cut -f6 $fdir/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $fdir/tmp2
cut -f7 $fdir/$fname | $MTOKENIZER -threads 8 -l en -no-escape | $LOWER > $fdir/tmp3
paste $fdir/tmp1 $fdir/tmp2 $fdir/tmp3 > $fdir/$fname
rm $fdir/tmp1 $fdir/tmp2 $fdir/tmp3
done
### download TREC
mkdir $data_path/TREC
for split in train_5500 TREC_10
do
urlname=$TREC/$split.label
curl -Lo $data_path/TREC/$split.label $urlname
sed -i -e "s/\`//g" $data_path/TREC/$split.label
sed -i -e "s/'//g" $data_path/TREC/$split.label
done
### download SICK
mkdir $data_path/SICK
for split in train trial test_annotated
do
urlname=$SICK/sick_$split.zip
curl -Lo $data_path/SICK/sick_$split.zip $urlname
unzip $data_path/SICK/sick_$split.zip -d $data_path/SICK/
rm $data_path/SICK/readme.txt
rm $data_path/SICK/sick_$split.zip
done
for split in train trial test_annotated
do
fname=$data_path/SICK/SICK_$split.txt
cut -f1 $fname | sed '1d' > $data_path/SICK/tmp1
cut -f4,5 $fname | sed '1d' > $data_path/SICK/tmp45
cut -f2 $fname | sed '1d' | $MTOKENIZER -threads 8 -l en -no-escape > $data_path/SICK/tmp2
cut -f3 $fname | sed '1d' | $MTOKENIZER -threads 8 -l en -no-escape > $data_path/SICK/tmp3
head -n 1 $fname > $data_path/SICK/tmp0
paste $data_path/SICK/tmp1 $data_path/SICK/tmp2 $data_path/SICK/tmp3 $data_path/SICK/tmp45 >> $data_path/SICK/tmp0
mv $data_path/SICK/tmp0 $fname
rm $data_path/SICK/tmp*
done
### download MR CR SUBJ MPQA
# Download and unzip file
curl -Lo $data_path/data_classif.zip $BINCLASSIF
unzip $data_path/data_classif.zip -d $data_path/data_bin_classif
rm $data_path/data_classif.zip
# MR
mkdir $data_path/MR
cat -v $data_path/data_bin_classif/data/rt10662/rt-polarity.pos | $PTBTOKENIZER > $data_path/MR/rt-polarity.pos
cat -v $data_path/data_bin_classif/data/rt10662/rt-polarity.neg | $PTBTOKENIZER > $data_path/MR/rt-polarity.neg
# CR
mkdir $data_path/CR
cat -v $data_path/data_bin_classif/data/customerr/custrev.pos | $PTBTOKENIZER > $data_path/CR/custrev.pos
cat -v $data_path/data_bin_classif/data/customerr/custrev.neg | $PTBTOKENIZER > $data_path/CR/custrev.neg
# SUBJ
mkdir $data_path/SUBJ
cat -v $data_path/data_bin_classif/data/subj/subj.subjective | $PTBTOKENIZER > $data_path/SUBJ/subj.subjective
cat -v $data_path/data_bin_classif/data/subj/subj.objective | $PTBTOKENIZER > $data_path/SUBJ/subj.objective
# MPQA
mkdir $data_path/MPQA
cat -v $data_path/data_bin_classif/data/mpqa/mpqa.pos | $PTBTOKENIZER > $data_path/MPQA/mpqa.pos
cat -v $data_path/data_bin_classif/data/mpqa/mpqa.neg | $PTBTOKENIZER > $data_path/MPQA/mpqa.neg
# CLEAN-UP
rm -r $data_path/data_bin_classif
### download SNLI
mkdir $data_path/SNLI
curl -Lo $data_path/SNLI/snli_1.0.zip $SNLI
unzip $data_path/SNLI/snli_1.0.zip -d $data_path/SNLI
rm $data_path/SNLI/snli_1.0.zip
rm -r $data_path/SNLI/__MACOSX
for split in train dev test
do
fpath=$data_path/SNLI/$split.snli.txt
awk '{ if ( $1 != "-" ) { print $0; } }' $data_path/SNLI/snli_1.0/snli_1.0_$split.txt | cut -f 1,6,7 | sed '1d' > $fpath
cut -f1 $fpath > $data_path/SNLI/labels.$split
cut -f2 $fpath | $PTBTOKENIZER > $data_path/SNLI/s1.$split
cut -f3 $fpath | $PTBTOKENIZER > $data_path/SNLI/s2.$split
rm $fpath
done
rm -r $data_path/SNLI/snli_1.0
### Get COCO captions and resnet-101 2048d-features
# Captions : Copyright (c) 2015, COCO Consortium. All rights reserved.
mkdir $data_path/COCO
for split in train valid test
do
curl -Lo $data_path/COCO/$split.pkl $COCO/$split.pkl
done
### download MRPC
mkdir $data_path/MRPC
curl -Lo $data_path/MRPC/msr_paraphrase_train.txt https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt
curl -Lo $data_path/MRPC/msr_paraphrase_test.txt https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt
# remove moses folder
rm -rf mosesdecoder
# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.
# expected input: raw text with ONE SENTENCE TOKEN PER LINE
# by Robert MacIntyre, University of Pennsylvania, late 1995.
# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License. If you
# want to be picky, assume that all of its terms apply. Okay?
# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end
s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g
# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g
s=--= -- =g
# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
s=$= =
s=^= =
s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g
s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g
# clean out extra spaces
s=  *= =g
s=^ *==g
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
from scipy.stats import pearsonr, spearmanr  # correlation measures used by evaluate() below
```
%% Cell type:code id: tags:
``` python
def evaluate(task, methods, verbose=False):
    """ Computes the weighted Pearson and Spearman correlations of a task
    using the specified methods """
    pearson_correlations = {}
    spearman_correlations = {}
    for label, method in methods:
        task_pearson = []
        task_spearman = []
        task_weights = []
        for dataset in task.keys():
            sentences1, sentences2, gs = task[dataset]
            task_weights.append(len(gs))
            sims = method(sentences1, sentences2)
            task_pearson.append(pearsonr(sims, gs)[0])
            task_spearman.append(spearmanr(sims, gs)[0])
            if verbose:
                print("{} - {}: r = {:.4f}, rho = {:.4f}".format(
                    label, dataset, task_pearson[-1], task_spearman[-1]))
        # Weight each dataset by its number of sentence pairs
        wpearson = sum(task_pearson[i] * task_weights[i] / sum(task_weights) for i in range(len(task_weights)))
        wspearman = sum(task_spearman[i] * task_weights[i] / sum(task_weights) for i in range(len(task_weights)))
        pearson_correlations[label] = wpearson
        spearman_correlations[label] = wspearman
    return pearson_correlations, spearman_correlations
```
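%% Cell type:markdown id: tags:
For each method, `evaluate` combines the per-dataset correlations into a single score, weighting each dataset by its number of sentence pairs. With $r_i$ the Pearson (or Spearman) correlation on dataset $i$ and $n_i$ its number of pairs, the reported value is

$$\bar{r} = \frac{\sum_i n_i \, r_i}{\sum_i n_i}.$$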
%% Cell type:markdown id: tags:
## Models
%% Cell type:code id: tags:
``` python
from gensim.models import KeyedVectors
```
%% Cell type:code id: tags:
``` python
PATH_WORD2VEC = './data/embedding/word2vec/GoogleNews-vectors-negative300.bin'
PATH_GLOVE = './data/embedding/glove/glove.840B.300d.w2v.txt'
PATH_FASTTEXT = './data/embedding/fasttext/crawl-300d-2M.vec'
```
%% Cell type:code id: tags:
``` python
word2vec = KeyedVectors.load_word2vec_format(PATH_WORD2VEC, binary=True)
```
%% Cell type:code id: tags:
``` python
glove = KeyedVectors.load_word2vec_format(PATH_GLOVE, binary=False)
```
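%% Cell type:markdown id: tags:
The fastText vectors referenced by `PATH_FASTTEXT` are not loaded in this run (the corresponding entries in `METHODS` below are commented out). If needed, they can be loaded the same way, since `crawl-300d-2M.vec` is in word2vec text format:
%% Cell type:code id: tags:
``` python
# Optional (not used in this run): load the fastText vectors declared above.
# fasttext = KeyedVectors.load_word2vec_format(PATH_FASTTEXT, binary=False)
```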
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
``` python
from load import loadSTS12, loadSTS13, loadSTS14, loadSTS15, loadSTS16
```
%% Cell type:code id: tags:
``` python
PATH_DATASETS = './data/datasets/STS'
PREPROCESSING = {'lowercase': False,
                 'stop_words': False,
                 'punctuation': False,
                 'only_ascii': False,
                 'lemmatization': False
                 }
```
%% Cell type:code id: tags:
``` python
sts12 = loadSTS12(PATH_DATASETS, PREPROCESSING)
sts13 = loadSTS13(PATH_DATASETS, PREPROCESSING)
sts14 = loadSTS14(PATH_DATASETS, PREPROCESSING)
sts15 = loadSTS15(PATH_DATASETS, PREPROCESSING)
```
%% Output
***** Transfer task : STS12 *****
Preprocessing -MSRpar-
-MSRpar- preprocessed correctly
Preprocessing -MSRvid-
-MSRvid- preprocessed correctly
Preprocessing -SMTeuroparl-
-SMTeuroparl- preprocessed correctly
Preprocessing -surprise.OnWN-
-surprise.OnWN- preprocessed correctly
Preprocessing -surprise.SMTnews-
-surprise.SMTnews- preprocessed correctly
***** Transfer task : STS13 (-SMT) *****
Preprocessing -FNWN-
-FNWN- preprocessed correctly
Preprocessing -headlines-
-headlines- preprocessed correctly
Preprocessing -OnWN-
-OnWN- preprocessed correctly
***** Transfer task : STS14 *****
Preprocessing -deft-forum-
-deft-forum- preprocessed correctly
Preprocessing -deft-news-
-deft-news- preprocessed correctly
Preprocessing -headlines-
-headlines- preprocessed correctly
Preprocessing -images-
-images- preprocessed correctly
Preprocessing -OnWN-
-OnWN- preprocessed correctly
Preprocessing -tweet-news-
-tweet-news- preprocessed correctly
***** Transfer task : STS15 *****
Preprocessing -answers-forums-
-answers-forums- preprocessed correctly
Preprocessing -answers-students-
-answers-students- preprocessed correctly
Preprocessing -belief-
-belief- preprocessed correctly
Preprocessing -headlines-
-headlines- preprocessed correctly
Preprocessing -images-
-images- preprocessed correctly
%% Cell type:markdown id: tags:
## Methods
%% Cell type:code id: tags:
``` python
from functools import partial
```
%% Cell type:code id: tags:
``` python
METHODS = [
    ("W2V + AVG", partial(avg_cosine, model=word2vec)),
    ("W2V + WMD", partial(wmd, model=word2vec)),
    ("GLOVE + AVG", partial(avg_cosine, model=glove)),
    ("GLOVE + WMD", partial(wmd, model=glove)),
    # ("FASTTEXT + AVG", partial(avg_cosine, model=fasttext)),
    # ("FASTTEXT + WMD", partial(wmd, model=fasttext))
]
```
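%% Cell type:markdown id: tags:
Each entry in `METHODS` pairs a label with a similarity function whose remaining signature is `(sentences1, sentences2)`: `functools.partial` binds the embedding model in advance, which is exactly the interface `evaluate` expects. A quick sanity check of that interface (the toy sentences below are purely illustrative):
%% Cell type:code id: tags:
``` python
# Illustrative check with hypothetical toy sentences: each bound method maps two
# lists of tokenized sentences to one similarity score per pair.
label, method = METHODS[0]
scores = method([["a", "man", "is", "walking"]], [["a", "woman", "is", "running"]])
assert len(scores) == 1
print(label, scores[0])
```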
%% Cell type:code id: tags:
``` python
pearson, spearman = evaluate(sts12, METHODS)
```
%% Cell type:code id: tags:
``` python
pearson
```
%% Output
{'W2V + AVG': 0.5576731761229754,
'W2V + WMD': 0.4735133931943548,
'GLOVE + AVG': 0.550325345521787,
'GLOVE + WMD': 0.5511226507959358}
%% Cell type:code id: tags:
``` python
```
load.py 0 → 100644
import io
import numpy as np
from utils import preprocess
def loadFile(path, datasets, preprocessing):
    data = {}
    for dataset in datasets:
        # Load sentence pairs
        sent1, sent2 = zip(*[l.split("\t") for l in
                             io.open(path + '/STS.input.%s.txt' % dataset,
                                     encoding='utf8').read().splitlines()])
        # Load Gold Standard files (similarity scores)
        raw_scores = np.array([x for x in
                               io.open(path + '/STS.gs.%s.txt' % dataset,
                                       encoding='utf8')
                               .read().splitlines()])
        # Consider only pairs with a score
        not_empty_idx = raw_scores != ''
        gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
        # Preprocess sentences
        print("Preprocessing -{}-".format(dataset))
        sent1 = preprocess(sent1, **preprocessing)[not_empty_idx]
        sent2 = preprocess(sent2, **preprocessing)[not_empty_idx]
        print("-{}- preprocessed correctly".format(dataset))
        # Sort data by length to minimize padding in batcher
        sorted_data = sorted(zip(sent1, sent2, gs_scores),
                             key=lambda z: (len(z[0]), len(z[1]), z[2]))
        sent1, sent2, gs_scores = map(list, zip(*sorted_data))
        data[dataset] = (sent1, sent2, gs_scores)
    return data
def loadSTS12(path, preprocessing):
    print('***** Transfer task : STS12 *****\n\n')
    datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
                'surprise.OnWN', 'surprise.SMTnews']
    return loadFile('{}/STS12-en-test'.format(path), datasets, preprocessing)

# STS13 here does not contain the "SMT" subtask due to LICENSE issue
def loadSTS13(path, preprocessing):
    print('***** Transfer task : STS13 (-SMT) *****\n\n')
    datasets = ['FNWN', 'headlines', 'OnWN']
    return loadFile('{}/STS13-en-test'.format(path), datasets, preprocessing)

def loadSTS14(path, preprocessing):
    print('***** Transfer task : STS14 *****\n\n')
    datasets = ['deft-forum', 'deft-news', 'headlines',
                'images', 'OnWN', 'tweet-news']
    return loadFile('{}/STS14-en-test'.format(path), datasets, preprocessing)

def loadSTS15(path, preprocessing):
    print('***** Transfer task : STS15 *****\n\n')
    datasets = ['answers-forums', 'answers-students',
                'belief', 'headlines', 'images']
    return loadFile('{}/STS15-en-test'.format(path), datasets, preprocessing)

def loadSTS16(path, preprocessing):
    print('***** Transfer task : STS16 *****\n\n')
    datasets = ['answer-answer', 'headlines', 'plagiarism',
                'postediting', 'question-question']
    return loadFile('{}/STS16-en-test'.format(path), datasets, preprocessing)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
def avg_cosine(sentences1, sentences2, model):
    """ Computes, for each sentence pair, the cosine similarity between the
    averages of the word embeddings of the two sentences """
    sims = []
    for (sent1, sent2) in zip(sentences1, sentences2):
        # Keep only the tokens covered by the embedding model
        tokens1 = list(filter(lambda token: token in model, sent1))
        tokens2 = list(filter(lambda token: token in model, sent2))
        if len(tokens1) == 0 or len(tokens2) == 0:
            sims.append(0)
        else:
            embedding1 = np.average(list(map(lambda token: model[token], tokens1)), axis=0).reshape(1, -1)
            embedding2 = np.average(list(map(lambda token: model[token], tokens2)), axis=0).reshape(1, -1)
            sim = cosine_similarity(embedding1, embedding2)[0][0]
            sims.append(sim)
    return sims

def wmd(sentences1, sentences2, model):
    """ Computes, for each sentence pair, the negated Word Mover's Distance,
    so that larger values mean more similar sentences """
    sims = []
    for (sent1, sent2) in zip(sentences1, sentences2):
        tokens1 = list(filter(lambda token: token in model, sent1))
        tokens2 = list(filter(lambda token: token in model, sent2))
        d = -model.wmdistance(tokens1, tokens2)
        # Cap the value to avoid -inf when one of the sentences has no covered tokens
        sims.append(max(d, -100))
    return sims
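# A minimal usage sketch (hypothetical toy sentences; assumes a gensim
# KeyedVectors model `model` has been loaded elsewhere, e.g. with
# KeyedVectors.load_word2vec_format):
#
#   sims = avg_cosine([["a", "dog", "runs"]], [["a", "cat", "sleeps"]], model=model)
#   dists = wmd([["a", "dog", "runs"]], [["a", "cat", "sleeps"]], model=model)
#
# Both return one score per sentence pair; wmd returns negated (capped) distances,
# so larger values mean more similar sentences.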
\ No newline at end of file
utils.py 0 → 100644
import numpy as np
import spacy
from scipy.stats import spearmanr, pearsonr
def preprocess(sentences, lowercase=True, stop_words=True, punctuation=True,
               only_ascii=True, lemmatization=True):
    """ Preprocesses the given sentences applying the specified filters
    and extracting the tokens that pass those filters """
    nlp = spacy.load("en_core_web_sm")
    preprocessed_sentences = []
    for doc in nlp.pipe(sentences, disable=["tagger", "parser", "ner"]):
        tokens = list(doc)  # start from all tokens in the document
        if stop_words:
            tokens = list(filter(lambda t: not t.is_stop, tokens))
        if punctuation:
            tokens = list(filter(lambda t: not t.is_punct, tokens))
        if only_ascii:
            tokens = list(filter(lambda t: t.is_ascii, tokens))
        if lemmatization:
            tokens = list(map(lambda t: t.lemma_, tokens))
        else:
            tokens = list(map(lambda t: t.text, tokens))
        if lowercase:
            tokens = list(map(lambda t: t.lower(), tokens))
        preprocessed_sentences.append(np.array(tokens))
    return np.array(preprocessed_sentences)
def evaluate(task, methods):
    """ Computes the weighted Pearson and Spearman correlations of a task
    using the specified methods """
    pearson_correlations = {}
    spearman_correlations = {}
    for label, method in methods:
        task_pearson = []
        task_spearman = []
        task_weights = []
        for dataset in task.keys():
            sentences1, sentences2, gs = task[dataset]
            task_weights.append(len(gs))
            sims = method(sentences1, sentences2)
            task_pearson.append(pearsonr(sims, gs)[0])
            task_spearman.append(spearmanr(sims, gs)[0])
        # Weight each dataset by its number of sentence pairs
        wpearson = sum(task_pearson[i] * task_weights[i] / sum(task_weights) for i in range(len(task_weights)))
        wspearman = sum(task_spearman[i] * task_weights[i] / sum(task_weights) for i in range(len(task_weights)))
        pearson_correlations[label] = wpearson
        spearman_correlations[label] = wspearman
    return pearson_correlations, spearman_correlations
\ No newline at end of file