-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_hearst_patterns.py
50 lines (42 loc) · 1.28 KB
/
extract_hearst_patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from aochildes.dataset import AOChildesDataSet
# Pipeline components left out when the model is loaded.
# NOTE(review): presumably skipped because the token patterns in this script
# only use POS / LOWER / IS_PUNCT attributes - confirm before re-enabling.
_SKIPPED_COMPONENTS = ['ner', 'parser']

# Texts to scan: whatever AOChildesDataSet.load_transcripts() returns.
transcripts = AOChildesDataSet().load_transcripts()

# Small English pipeline, minus the components listed above.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=_SKIPPED_COMPONENTS,
)

# Token-pattern matcher sharing the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
def _noun():
    """Return a fresh token spec that matches a token whose POS is NOUN."""
    return {'POS': 'NOUN'}


def _word(lowercase_form):
    """Return a fresh token spec that matches on the token's lowercase text."""
    return {'LOWER': lowercase_form}


def _optional_punct():
    """Return a fresh token spec for a punctuation token with operator '?'."""
    return {'IS_PUNCT': True, 'OP': '?'}


# NOUN "and" "other" NOUN
pattern1 = [_noun(), _word('and'), _word('other'), _noun()]

# NOUN [punct] "especially" NOUN
pattern2 = [_noun(), _optional_punct(), _word('especially'), _noun()]

# NOUN [punct] "including" NOUN
pattern3 = [_noun(), _optional_punct(), _word('including'), _noun()]

# NOUN "or" "other" NOUN
pattern4 = [_noun(), _word('or'), _word('other'), _noun()]

# NOUN [punct] "such" "as" NOUN
pattern5 = [_noun(), _optional_punct(), _word('such'), _word('as'), _noun()]

# All five patterns are registered under one key, so a hit on any of them
# is reported with the same match id.
# NOTE(review): the key is spelled "hypernyn" (not "hypernym"); it is kept
# unchanged here so the resulting match id stays the same.
matcher.add(
    "hypernyn-hyponym",
    [pattern1, pattern2, pattern3, pattern4, pattern5],
)
# nlp.pipe(..., n_process=4) starts worker processes. With the "spawn" start
# method (Windows, and the default on macOS) every worker re-imports this
# module; without the guard each worker would reach this loop again and try
# to start workers of its own, which multiprocessing rejects with a
# RuntimeError. Guarding the loop keeps the script runnable on every platform.
if __name__ == "__main__":
    # Run every transcript through the pipeline and print the text of each
    # span that matches one of the registered patterns.
    for doc in nlp.pipe(transcripts, n_process=4):
        doc: Doc
        # Each match is a (match_id, start, end) triple; start/end are token
        # offsets into doc. The id is not needed for printing.
        for _match_id, start, end in matcher(doc):
            print(doc[start:end].text)