Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
semantic_similarity
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
pabvald
semantic_similarity
Commits
c583fca6
Commit
c583fca6
authored
Feb 27, 2020
by
pabvald
Browse files
Options
Downloads
Patches
Plain Diff
SICK dataset added
parent
2c8043df
No related branches found
No related tags found
No related merge requests found
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
Untitled.ipynb
+0
-85
0 additions, 85 deletions
Untitled.ipynb
evaluation.ipynb
+90
-52
90 additions, 52 deletions
evaluation.ipynb
load.py
+51
-12
51 additions, 12 deletions
load.py
results.ods
+0
-0
0 additions, 0 deletions
results.ods
with
141 additions
and
149 deletions
Untitled.ipynb
deleted
100644 → 0
+
0
−
85
View file @
2c8043df
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from load import load_file\n",
"import os\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
def load_SICK(path, verbose=False):
    """Load the SICK relatedness dataset (train, dev and test splits).

    Parameters
    ----------
    path : str
        Directory containing 'SICK_train.txt', 'SICK_trial.txt' and
        'SICK_test_annotated.txt'.
    verbose : bool, optional
        When True, print a task banner before loading.

    Returns
    -------
    dict
        {'train': ..., 'dev': ..., 'test': ...}, each value being the
        dict produced by load_file_SICK for that split.
    """
    if verbose:
        print('***** Task: SICK-Relatedness*****\n\n')
    sick_train = load_file_SICK(os.path.join(path, 'SICK_train.txt'))
    sick_dev = load_file_SICK(os.path.join(path, 'SICK_trial.txt'))
    sick_test = load_file_SICK(os.path.join(path, 'SICK_test_annotated.txt'))
    # Previously only the train split was returned while the dev/test
    # loads were silently discarded; return all three so the work and
    # I/O are not wasted.
    return {'train': sick_train, 'dev': sick_dev, 'test': sick_test}
def load_file_SICK(path):
    """Load a single SICK dataset file.

    Parameters
    ----------
    path : str
        Path to a tab-separated SICK file whose first line is a header.
        Columns 1, 2 and 3 (0-indexed) are read as sentence A, sentence B
        and the relatedness score — assumes the standard SICK layout;
        other columns are ignored.

    Returns
    -------
    dict
        {'sent_1': list of token lists, 'sent_2': list of token lists,
         'sim': list of float relatedness scores}.
    """
    sick_data = {'sent_1': [], 'sent_2': [], 'sim': []}
    with io.open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row instead of tracking a per-line flag
        for line in f:
            text = line.strip().split('\t')
            sick_data['sent_1'].append(text[1].split())
            sick_data['sent_2'].append(text[2].split())
            # Convert scores as we go rather than in a second pass.
            sick_data['sim'].append(float(text[3]))
    return sick_data
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
# Load the SICK relatedness data from the repository's local dataset directory.
sick_dir = './data/datasets/SICK/'
a = load_SICK(sick_dir)
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:
```
python
from
load
import
load_file
import
os
import
io
```
%% Cell type:code id: tags:
```
python
def
load_SICK
(
path
,
verbose
=
False
):
if
verbose
:
print
(
'
***** Task: SICK-Relatedness*****
\n\n
'
)
sick_train
=
load_file_SICK
(
os
.
path
.
join
(
path
,
'
SICK_train.txt
'
))
sick_dev
=
load_file_SICK
(
os
.
path
.
join
(
path
,
'
SICK_trial.txt
'
))
sick_test
=
load_file_SICK
(
os
.
path
.
join
(
path
,
'
SICK_test_annotated.txt
'
))
return
sick_train
def
load_file_SICK
(
path
):
skipFirstLine
=
True
sick_data
=
{
'
sent_1
'
:
[],
'
sent_2
'
:
[],
'
sim
'
:
[]}
with
io
.
open
(
path
,
'
r
'
,
encoding
=
'
utf-8
'
)
as
f
:
for
line
in
f
:
if
skipFirstLine
:
skipFirstLine
=
False
else
:
text
=
line
.
strip
().
split
(
'
\t
'
)
sick_data
[
'
sent_1
'
].
append
(
text
[
1
].
split
())
sick_data
[
'
sent_2
'
].
append
(
text
[
2
].
split
())
sick_data
[
'
sim
'
].
append
(
text
[
3
])
sick_data
[
'
sim
'
]
=
[
float
(
s
)
for
s
in
sick_data
[
'
sim
'
]]
return
sick_data
```
%% Cell type:code id: tags:
```
python
a
=
load_SICK
(
'
./data/datasets/SICK/
'
)
```
%% Cell type:code id: tags:
```
python
```
This diff is collapsed.
Click to expand it.
evaluation.ipynb
+
90
−
52
View file @
c583fca6
This diff is collapsed.
Click to expand it.
load.py
+
51
−
12
View file @
c583fca6
...
...
@@ -14,6 +14,7 @@
import
io
import
os
import
csv
import
numpy
as
np
from
utils
import
preprocess
...
...
@@ -29,8 +30,8 @@ def load_frequencies(path):
return
frequencies
def
load_file
(
path
,
datasets
,
preprocessing
,
verbose
=
False
):
"""
Loads a
nd
STS file and pre
-
processes its sentences
"""
def
load_file
_STS
(
path
,
datasets
,
preprocessing
,
verbose
=
False
):
"""
Loads a STS
test
file and preprocesses its sentences
"""
data
=
{}
for
dataset
in
datasets
:
...
...
@@ -65,43 +66,81 @@ def load_file(path, datasets, preprocessing, verbose=False):
return
data
def load_SICK(path, preprocessing, verbose=False):
    """Load the SICK train/dev/test files, preprocess their sentences,
    and return the splits keyed 'train', 'test' and 'dev'."""
    if verbose:
        print('\n\n***** Task: SICK-Relatedness*****\n')
    sick_train = load_file_SICK(os.path.join(path, 'SICK_train.txt'), preprocessing)
    sick_dev = load_file_SICK(os.path.join(path, 'SICK_trial.txt'), preprocessing)
    sick_test = load_file_SICK(os.path.join(path, 'SICK_test_annotated.txt'), preprocessing)
    # Bundle the three splits under their conventional names.
    return {'train': sick_train, 'test': sick_test, 'dev': sick_dev}
def load_file_SICK(path, preprocessing):
    """Load one SICK file and preprocess its sentences.

    Parameters
    ----------
    path : str
        Path to a tab-separated SICK file. The first line is a header;
        columns 1, 2 and 3 (0-indexed) of each following line are read
        as sentence A, sentence B and the relatedness score — assumes
        the standard SICK layout.
    preprocessing : dict
        Keyword arguments forwarded to preprocess (from utils).

    Returns
    -------
    tuple
        (sent1, sent2, sim): the two preprocessed sentence lists and the
        relatedness scores converted to floats.
    """
    sent1 = []
    sent2 = []
    sim = []
    # Read file
    with io.open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row instead of tracking a per-line flag
        for line in f:
            text = line.strip().split('\t')
            sent1.append(text[1])
            sent2.append(text[2])
            sim.append(text[3])
    sent1 = preprocess(sent1, **preprocessing)
    sent2 = preprocess(sent2, **preprocessing)
    sim = [float(s) for s in sim]
    return (sent1, sent2, sim)
def load_sts_12(path, preprocessing, verbose=False):
    """Load the SemEval-2012 Semantic Textual Similarity test data."""
    if verbose:
        print('\n\n***** TASK: STS12 *****\n')
    subtasks = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
    test_path = '{}/STS12-en-test'.format(path)
    return load_file_STS(test_path, subtasks, preprocessing, verbose=verbose)
def load_sts_13(path, preprocessing, verbose=False):
    """Load the SemEval-2013 Semantic Textual Similarity test data."""
    # STS13 here does not contain the "SMT" subtask due to LICENSE issue
    if verbose:
        print('\n\n***** TASK: STS13 (-SMT) ***\n')
    subtasks = ['FNWN', 'headlines', 'OnWN']
    test_path = '{}/STS13-en-test'.format(path)
    return load_file_STS(test_path, subtasks, preprocessing, verbose=verbose)
def load_sts_14(path, preprocessing, verbose=False):
    """Load the SemEval-2014 Semantic Textual Similarity test data."""
    if verbose:
        print('\n\n***** TASK: STS14 *****\n')
    subtasks = ['deft-forum', 'deft-news', 'headlines', 'images', 'OnWN', 'tweet-news']
    test_path = '{}/STS14-en-test'.format(path)
    return load_file_STS(test_path, subtasks, preprocessing, verbose=verbose)
def load_sts_15(path, preprocessing, verbose=False):
    """Load the SemEval-2015 Semantic Textual Similarity test data."""
    if verbose:
        print('\n\n***** TASK: STS15 *****\n')
    subtasks = ['answers-forums', 'answers-students', 'belief', 'headlines', 'images']
    test_path = '{}/STS15-en-test'.format(path)
    return load_file_STS(test_path, subtasks, preprocessing, verbose=verbose)
def load_sts_16(path, preprocessing, verbose=False):
    """Load the SemEval-2016 Semantic Textual Similarity test data."""
    if verbose:
        print('\n\n***** TASK: STS16 *****\n')
    subtasks = ['answer-answer', 'headlines', 'plagiarism', 'postediting', 'question-question']
    test_path = '{}/STS16-en-test'.format(path)
    return load_file_STS(test_path, subtasks, preprocessing, verbose=verbose)
This diff is collapsed.
Click to expand it.
results.ods
+
0
−
0
View file @
c583fca6
No preview for this file type
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
sign in
to comment