-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsklearnLogit.py
77 lines (56 loc) · 1.7 KB
/
sklearnLogit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Train and evaluate a logistic-regression spam classifier on TF-IDF features
# built from the subject and body text of the mails listed in mail_list.csv.

# Read the dataframe and drop incomplete rows.
# dropna() returns a new frame, so the result has to be re-assigned;
# calling it without using the return value leaves df unchanged.
df = pd.read_csv('mail_list.csv', delimiter=',', names=["Subj", "Body", "Spam"])
df = df.dropna()

# Vectorize.
# Both text fields are cast to unicode so non-string cells become text.
subjects = df["Subj"].values.astype('U')
bodies = df["Body"].values.astype('U')

# Fit ONE vocabulary over subjects and bodies together, then transform each
# field with it. Fitting separately per field yields two unrelated
# vocabularies: the matrices then have different column counts/meanings and
# cannot be added element-wise.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set text influences the features.
vectorizer = TfidfVectorizer()
vectorizer.fit(np.concatenate([subjects, bodies]))
subj_mat = vectorizer.transform(subjects)
body_mat = vectorizer.transform(bodies)

# Same shape and same column meaning now, so the sum is well defined.
joined_matrix = subj_mat + body_mat
print(joined_matrix)

# Split the features and labels into train and test sets.
X_train, X_test, Y_train, y_test = train_test_split(joined_matrix, df["Spam"])

# Logit classifier: train the model, then predict on the held-out rows.
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)
print("Made ", len(predictions), "predictions")
print(accuracy_score(y_test, predictions))

# Disabled plotting snippet, kept as a bare string literal so it never runs.
# It sums the entries of each X_train row and scatter-plots those sums
# against Y_train.
# NOTE(review): the snippet is stored without indentation and would need to
# be re-indented before it could be re-enabled.
"""
xt = []
for x in X_train:
s = 0
for y in x:
s += y
xt.append(s)
plt.scatter(xt,Y_train)
plt.title("Logistic Regression")
plt.xlabel('Mean Vecfloat')
plt.ylabel('Spam (1:Spam, 0:Ham)')
plt.show()
"""
# Disabled snippet, kept as a bare string literal so it never runs.
# It writes one CSV row per message to mail_list.csv: subject text, body
# text, and the spam flag as 1/0.
# NOTE(review): `csv` and `Messages` are not defined in the visible file.
# NOTE(review): `body` is built by iterating m.Subject a second time, which
# looks like a copy/paste slip -- confirm the intended attribute.
# NOTE(review): stored without indentation; re-indent before re-enabling.
"""
with open("mail_list.csv" ,"w+" ) as f:
csv_writer = csv.writer(f , delimiter=",")
for m in Messages:
row = []
subj = ''
body = ''
for w in m.Subject:
subj = subj + ' ' + w
for w in m.Subject:
body = body + ' ' + w
spam = m.Spam
if spam is True : spam = 1
else: spam = 0
row.append(subj)
row.append(body)
row.append(spam)
csv_writer.writerow(row)
"""