-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarizer.py
218 lines (172 loc) · 8.34 KB
/
summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# Gensim Imports
from gensim.summarization.summarizer import summarize
# Spacy Imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# NLTK Imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# Sumy Imports
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
# Other Imports
from string import punctuation
from heapq import nlargest
def gensim_summarize(text_content, percent):
    """Summarize text with the TextRank implementation shipped in gensim.

    Args:
        text_content: The text to summarize.
        percent: Share of the text to keep, given as a percentage; it is
            converted with ``int()`` and divided by 100 to form gensim's
            ``ratio`` argument.

    Returns:
        The summary as a single string with sentences separated by spaces.
    """
    keep_ratio = int(percent) / 100
    # With split=False gensim hands back one string whose sentences are
    # joined by "\n" (split=True would return a list instead).
    newline_joined = summarize(text_content, ratio=keep_ratio, split=False)
    # Flatten the gensim output onto a single line.
    return newline_joined.replace("\n", " ")
def spacy_summarize(text_content, percent):
    """Summarize text by word-frequency sentence scoring, tokenized with spaCy.

    Every token that is neither a stop word nor pure punctuation/whitespace is
    counted (case-insensitively), counts are normalised by the highest count,
    and each sentence is scored by the sum of the normalised counts of its
    tokens. The highest scoring sentences are returned.

    Args:
        text_content: The text to summarize.
        percent: Share of the sentences to keep, given as a percentage; it is
            converted with ``int()``.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. An empty string is returned when the text is blank
        or contains no countable words.
    """
    # Blank input has nothing to score; bailing out here also avoids calling
    # max() on an empty frequency table further down.
    if not text_content or not text_content.strip():
        return ""

    # Loading en_core_web_sm in spaCy and building the NLP object.
    nlp = spacy.load('en_core_web_sm')
    nlp_object = nlp(text_content)

    # Count occurrences per lowercased word. Keys are lowercased so that the
    # sentence-scoring lookup below (which also lowercases) can find them.
    word_frequencies = {}
    for token in nlp_object:
        key = token.text.lower()
        if key in STOP_WORDS:
            continue
        # Skip tokens made up solely of punctuation/whitespace characters
        # (covers multi-character tokens such as "..." or "\n\n").
        if all(ch in punctuation or ch.isspace() for ch in key):
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    # Text consisting only of stop words / punctuation yields no scores.
    if not word_frequencies:
        return ""

    # Normalise every count by the count of the most frequent word.
    max_frequency = max(word_frequencies.values())
    for key in word_frequencies:
        word_frequencies[key] = word_frequencies[key] / max_frequency

    # Score sentences by their spaCy tokens rather than by splitting on
    # spaces, so words adjacent to punctuation ("word.") are still matched.
    sentence_token = list(nlp_object.sents)
    sentence_scores = {}
    for sent in sentence_token:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    # Number of sentences that corresponds to the requested percentage.
    select_length = int(len(sentence_token) * (int(percent) / 100))
    # Pick the top weighted sentences and join their text.
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in best_sentences)
def nltk_summarize(text_content, percent):
    """Summarize text by word-frequency sentence scoring, tokenized with NLTK.

    Every token that is neither an English stop word nor pure
    punctuation/whitespace is counted (case-insensitively), counts are
    normalised by the highest count, and each sentence is scored by the sum
    of the normalised counts of its tokens. The highest scoring sentences
    are returned.

    Args:
        text_content: The text to summarize.
        percent: Share of the sentences to keep, given as a percentage; it is
            converted with ``int()``.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. An empty string is returned when the text is blank
        or contains no countable words.
    """
    # Blank input has nothing to score; bailing out here also avoids calling
    # max() on an empty frequency table further down.
    if not text_content or not text_content.strip():
        return ""

    # A set gives constant-time stop-word membership tests.
    stop_words = set(stopwords.words('english'))

    # Count occurrences per lowercased word. Keys are lowercased so that the
    # sentence-scoring lookup below (which also lowercases) can find them.
    word_frequencies = {}
    for token in word_tokenize(text_content):
        key = token.lower()
        if key in stop_words:
            continue
        # Skip tokens made up solely of punctuation/whitespace characters
        # (covers multi-character tokens such as "``", "''" or "...").
        if all(ch in punctuation or ch.isspace() for ch in key):
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    # Text consisting only of stop words / punctuation yields no scores.
    if not word_frequencies:
        return ""

    # Normalise every count by the count of the most frequent word.
    max_frequency = max(word_frequencies.values())
    for key in word_frequencies:
        word_frequencies[key] = word_frequencies[key] / max_frequency

    # Score sentences with the same word tokenizer used for counting, so
    # words adjacent to punctuation ("word.") are still matched.
    sentence_token = sent_tokenize(text_content)
    sentence_scores = {}
    for sent in sentence_token:
        for token in word_tokenize(sent):
            weight = word_frequencies.get(token.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    # Number of sentences that corresponds to the requested percentage.
    select_length = int(len(sentence_token) * (int(percent) / 100))
    # Pick the top weighted sentences and join them.
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    return ' '.join(best_sentences)
def sumy_lsa_summarize(text_content, percent):
    """Summarize text with sumy's ``LsaSummarizer`` (Latent Semantic Analysis).

    Args:
        text_content: The text to summarize.
        percent: Share of the sentences to keep, given as a percentage; it is
            converted with ``int()``.

    Returns:
        The sentences chosen by the summarizer joined by single spaces, or an
        empty string when the text is blank.
    """
    # Nothing to summarize for blank input.
    if not text_content or not text_content.strip():
        return ""

    # Initializing the parser.
    parser = PlaintextParser.from_string(text_content, Tokenizer("english"))
    # Initializing the summarizer with an English stemmer and stop words.
    summarizer = LsaSummarizer(Stemmer('english'))
    summarizer.stop_words = get_stop_words('english')

    # sumy is given a sentence count here, so translate the percentage into
    # a number of sentences based on NLTK's sentence split of the text.
    sentence_token = sent_tokenize(text_content)
    select_length = int(len(sentence_token) * (int(percent) / 100))

    # Join with a space so consecutive sentences do not run together.
    selected = summarizer(parser.document, sentences_count=select_length)
    return " ".join(str(sentence) for sentence in selected)
def sumy_luhn_summarize(text_content, percent):
    """Summarize text with sumy's ``LuhnSummarizer``.

    Args:
        text_content: The text to summarize.
        percent: Share of the sentences to keep, given as a percentage; it is
            converted with ``int()``.

    Returns:
        The sentences chosen by the summarizer joined by single spaces, or an
        empty string when the text is blank.
    """
    # Nothing to summarize for blank input.
    if not text_content or not text_content.strip():
        return ""

    # Initializing the parser.
    parser = PlaintextParser.from_string(text_content, Tokenizer("english"))
    # Initializing the summarizer with an English stemmer and stop words.
    summarizer = LuhnSummarizer(Stemmer('english'))
    summarizer.stop_words = get_stop_words('english')

    # sumy is given a sentence count here, so translate the percentage into
    # a number of sentences based on NLTK's sentence split of the text.
    sentence_token = sent_tokenize(text_content)
    select_length = int(len(sentence_token) * (int(percent) / 100))

    # Join with a space so consecutive sentences do not run together.
    selected = summarizer(parser.document, sentences_count=select_length)
    return " ".join(str(sentence) for sentence in selected)
def sumy_text_rank_summarize(text_content, percent):
    """Summarize text with sumy's ``TextRankSummarizer``.

    Args:
        text_content: The text to summarize.
        percent: Share of the sentences to keep, given as a percentage; it is
            converted with ``int()``.

    Returns:
        The sentences chosen by the summarizer joined by single spaces, or an
        empty string when the text is blank.
    """
    # Nothing to summarize for blank input.
    if not text_content or not text_content.strip():
        return ""

    # Initializing the parser.
    parser = PlaintextParser.from_string(text_content, Tokenizer("english"))
    # Initializing the summarizer with an English stemmer and stop words.
    summarizer = TextRankSummarizer(Stemmer('english'))
    summarizer.stop_words = get_stop_words('english')

    # sumy is given a sentence count here, so translate the percentage into
    # a number of sentences based on NLTK's sentence split of the text.
    sentence_token = sent_tokenize(text_content)
    select_length = int(len(sentence_token) * (int(percent) / 100))

    # Join with a space so consecutive sentences do not run together.
    selected = summarizer(parser.document, sentences_count=select_length)
    return " ".join(str(sentence) for sentence in selected)