New notebook (2)
In [109]:
!pip install nltk numpy pandas scikit-learn seaborn
Requirement already satisfied: nltk, numpy, pandas, scikit-learn, seaborn (and their dependencies) in c:\users\administrator\appdata\local\programs\python\python311\lib\site-packages
[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip
In [110]:
import os
import csv
import math
import heapq
import random
import re
import string

import numpy as np
import pandas as pd
from pandas import DataFrame

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import f1_score
In [111]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[111]:
True
In [112]:
datapd = pd.read_csv("train.csv")
In [113]:
# plt.figure(figsize=(18, 5))  # adjust the size of the plot
value_counts = datapd['target'].value_counts()
sns.barplot(x=value_counts.index, y=value_counts.values)
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Value Counts of Target Column')
plt.show()
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):
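The same class-distribution plot can also be drawn in a single call with seaborn's countplot, which counts the classes itself; a minimal alternative sketch (not part of the original notebook), assuming datapd is already loaded:

# Alternative sketch: let seaborn count the target classes directly.
sns.countplot(x='target', data=datapd)
plt.xlabel('Target')
plt.ylabel('Count')
plt.title('Value Counts of Target Column')
plt.show()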
In [114]:
def read_data(path):
    '''Read a CSV file and return its rows as lists, skipping the header row.'''
    with open(path, "r", encoding='UTF-8') as file:
        reader = csv.reader(file)
        data = []
        for line in reader:
            if reader.line_num == 1:
                continue
            data.append(line)
    return data
In [115]:
data = read_data("train.csv")
test_data = read_data("test.csv")
In [116]:
# Randomly split the labelled data into a 70% train set and a 30% dev set,
# then save both splits to disk.
ratio = 0.7
train_set = []
len_train = 0
len_total = len(data)
indexes = []
removed_set = set()
while len_train < int(ratio * len_total):
    index = int(random.random() * len(data))
    while index in removed_set:
        index = int(random.random() * len(data))
    removed_set.add(index)
    indexes.append(data[index])
    len_train += 1
for i in indexes:
    train_set.append(i)
    data.remove(i)
dev_set = np.array(data)
train_set = np.array(train_set)
dev = {"id": dev_set[:, 0].tolist(),
       "keyword": dev_set[:, 1].tolist(),
       "location": dev_set[:, 2].tolist(),
       "text": dev_set[:, 3].tolist(),
       "target": dev_set[:, 4].tolist()}
train = {"id": train_set[:, 0].tolist(),
         "keyword": train_set[:, 1].tolist(),
         "location": train_set[:, 2].tolist(),
         "text": train_set[:, 3].tolist(),
         "target": train_set[:, 4].tolist()}
path = "./"
t = DataFrame(train)
d = DataFrame(dev)
d.to_csv(path + "dev_set.csv", index=False)
t.to_csv(path + "train_set.csv", index=False)
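For reference, the same 70/30 split can be done directly on the DataFrame with scikit-learn's train_test_split; this is only a hedged alternative sketch (the file names are hypothetical so the files written above are not overwritten), and the rest of the notebook keeps using the manual split:

# Alternative sketch (not used below): split the pandas DataFrame directly.
from sklearn.model_selection import train_test_split

t_df, d_df = train_test_split(datapd, train_size=0.7, random_state=42, stratify=datapd['target'])
t_df.to_csv("train_set_alt.csv", index=False)  # hypothetical file names
d_df.to_csv("dev_set_alt.csv", index=False)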
In [117]:
train_set = read_data("train_set.csv")
dev_set = read_data("dev_set.csv")
In [118]:
def preprocess_data(data):
    '''Lowercase, tokenize and clean the text field (index 3) of every row, in place.'''
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    punc = string.punctuation
    stop_words = set(stopwords.words('english'))
    for i in range(len(data)):
        data[i][3] = data[i][3].lower()
        data[i][3] = word_tokenize(data[i][3])
        delete_list = []
        process(data, delete_list, i, lemmatizer, punc, stop_words)
    return data


def process(data, delete_list, i, lemmatizer, punc, stop_words):
    # Mark unwanted tokens, drop them, then lemmatize what is left (treating words as verbs).
    for j, word in enumerate(data[i][3]):
        processword(delete_list, punc, stop_words, word)
    removewords(data, delete_list, i)
    for j, word in enumerate(data[i][3]):
        processed_word = lemmatizer.lemmatize(word, pos="v")
        data[i][3][j] = processed_word


def removewords(data, delete_list, i):
    for word in delete_list:
        data[i][3].remove(word)


def processword(delete_list, punc, stop_words, word):
    # Flag URL fragments, punctuation, non-alphabetic tokens and stopwords for removal.
    if word.startswith("//t"):
        delete_list.append(word)
    elif word.startswith("http"):
        delete_list.append(word)
    elif word in punc:
        delete_list.append(word)
    elif not word.isalpha():
        delete_list.append(word)
    elif word in stop_words:
        delete_list.append(word)
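To see what the cleaning step does to one tweet, here is a small illustrative run on a made-up row (the text and the expected result shown in the comment are hypothetical, not taken from the data set):

# Illustrative sketch: one hypothetical row in the same [id, keyword, location, text, target] layout.
sample = [["0", "", "", "Firefighters are battling huge flames near the river! http://t.co/abc123", "1"]]
print(preprocess_data(sample)[0][3])
# Expected output is roughly ['firefighters', 'battle', 'huge', 'flame', 'near', 'river']:
# URL pieces, punctuation and stopwords are dropped, verbs are lemmatized.
# The exact tokens depend on the installed NLTK models.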
In [119]:
total_processed_data = preprocess_data(data)
processed_data = preprocess_data(train_set)
processed_dev = preprocess_data(dev_set)
In [120]:
def word_bags(data):
    '''Fit a binary bag-of-words vectorizer (min_df=10) on the cleaned text.'''
    words = []
    words_set = set()
    label = []
    processlable(data, label, words, words_set)
    count_vect = CountVectorizer(binary=True, min_df=10)
    X_train = count_vect.fit_transform(words)
    return count_vect


def processlable(data, label, words, words_set):
    # Join each row's tokens back into a space-separated string and collect the labels.
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
        label.append([data[i][4]])
In [121]:
bags = word_bags(processed_data)
bags
Out[121]:
CountVectorizer(binary=True, min_df=10)
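A quick way to inspect what the fitted vectorizer kept (the exact vocabulary depends on the random split, so no specific numbers are shown here):

# Sketch: inspect the fitted bag-of-words vocabulary.
print(len(bags.vocabulary_))          # number of unigrams that met min_df=10
print(sorted(bags.vocabulary_)[:10])  # first few retained terms, alphabetically
print(bags.vocabulary_.get('fire'))   # column index of a term, or None if it was pruned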
In [122]:
def trans(x, X, count_vect):
    '''Convert each row's token list into a binary feature vector over the fitted vocabulary.'''
    train = x
    X_train = []
    for i in range(len(train)):
        X_train.append([])
        trans_process(X, X_train, count_vect, i, train)
    return X_train


def trans_process(X, X_train, count_vect, i, train):
    for j in range(X.shape[1]):
        X_train[i].append(0)
    for j in train[i][3]:
        index = count_vect.vocabulary_.get(j)
        if index is None:
            continue
        X_train[i][index] = 1


def naive_bayes(data, train, dev):
    '''Bernoulli naive Bayes on binary bag-of-words features; returns the dev-set F1.'''
    data = train
    words = []
    words_set = set()
    label = []
    bayes_preprocess(data, words, words_set)
    count_vect = CountVectorizer(binary=True, min_df=10)
    X = count_vect.fit_transform(words).toarray()
    X_train = trans(train, X, count_vect)
    X_train = np.array(X_train)
    n = X_train.shape[0]
    d = X_train.shape[1]
    K = 2
    dev_data, dev_label, phis, psis = nb_processlable(K, X, X_train, count_vect, d, dev, label, n, train)

    def nb_predictions(data, label, psis, phis):
        # Evaluate log p(y) + sum_j log p(x_j | y) for both classes and pick the larger.
        x = data
        n, d = x.shape
        x = np.reshape(x, (1, n, d))
        psis = np.reshape(psis, (K, 1, d))
        psis = psis.clip(1e-14, 1 - 1e-14)
        logpy = np.log(phis).reshape([K, 1])
        logpxy = x * np.log(psis) + (1 - x) * np.log(1 - psis)
        logpyx = logpxy.sum(axis=2) + logpy
        return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K, n]), label

    idx, logpyx, dev_label = nb_predictions(dev_data, dev_label, psis, phis)
    F1 = f1_score(dev_label, idx)
    print(F1)
    return F1


def nb_processlable(K, X, X_train, count_vect, d, dev, label, n, train):
    # Estimate the class priors (phis) and per-class feature probabilities (psis),
    # and vectorize the dev set with the same vocabulary.
    psis = np.zeros([K, d])
    phis = np.zeros([K])
    for i in range(len(train)):
        label.append(int(train[i][4]))
    label = np.array(label)
    for k in range(K):
        X_k = X_train[label == k]
        psis[k] = np.mean(X_k, axis=0)
        phis[k] = X_k.shape[0] / float(n)
    dev_label = []
    for i in range(len(dev)):
        dev_label.append(int(dev[i][4]))
    dev_label = np.array(dev_label)
    dev_data = trans(dev, X, count_vect)
    dev_data = np.array(dev_data)
    return dev_data, dev_label, phis, psis


def bayes_preprocess(data, words, words_set):
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
In [123]:
naive_bayes_F1 = naive_bayes(total_processed_data, processed_data, processed_dev)
0.7176151761517614
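As a sanity check on the hand-written model, scikit-learn's BernoulliNB fits the same binary bag-of-words formulation; a hedged cross-check sketch (not part of the original notebook, and its score will differ slightly because BernoulliNB uses Laplace smoothing instead of the clipping used above):

# Cross-check sketch: Bernoulli naive Bayes from scikit-learn on the same features.
from sklearn.naive_bayes import BernoulliNB

texts_tr = [" ".join(row[3]) for row in processed_data]
texts_dev = [" ".join(row[3]) for row in processed_dev]
y_tr = [int(row[4]) for row in processed_data]
y_dev = [int(row[4]) for row in processed_dev]

cv = CountVectorizer(binary=True, min_df=10)
Xtr = cv.fit_transform(texts_tr)
Xdev = cv.transform(texts_dev)

bnb = BernoulliNB()  # alpha=1.0 smoothing
bnb.fit(Xtr, y_tr)
print(f1_score(y_dev, bnb.predict(Xdev)))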
In [124]:
def logistic(data, train, dev):
    '''Logistic regression on binary bag-of-words features; returns the most influential words and the dev-set F1.'''
    data = train
    words = []
    words_set = set()
    label = []
    logistic_preprocess(data, words, words_set)
    count_vect = CountVectorizer(binary=True, min_df=10)
    X = count_vect.fit_transform(words).toarray()
    X_train, dev_data, dev_label, label = logistic_processlable(X, count_vect, dev, label, train)
    logreg = LogisticRegression(C=1e5, multi_class='multinomial', verbose=True, max_iter=300)
    logreg.fit(X_train, label)
    dev_predict = logreg.predict(dev_data)
    F1 = f1_score(dev_label, dev_predict)
    print(F1)
    influential_words = logistic_get_influential(count_vect, logreg)
    return influential_words, F1


def logistic_get_influential(count_vect, logreg):
    # Take the 10 features with the largest absolute coefficients and map them back to words.
    coef = logreg.coef_
    coef = [abs(i) for i in coef[0]]
    max_num_index_list = map(coef.index, heapq.nlargest(10, coef))
    max_num_index_list = list(max_num_index_list)
    import_words = set()
    most_ = max_num_index_list
    for i in count_vect.vocabulary_.keys():
        if count_vect.vocabulary_[i] in most_:
            import_words.add(i)
    influential_words = []
    for i in import_words:
        influential_words.append(i)
    print(influential_words)
    return influential_words


def logistic_processlable(X, count_vect, dev, label, train):
    # Vectorize the train and dev sets with the fitted vocabulary and collect integer labels.
    X_train = trans(train, X, count_vect)
    X_train = np.array(X_train)
    for i in range(len(train)):
        label.append(int(train[i][4]))
    label = np.array(label)
    dev_label = []
    for i in range(len(dev)):
        dev_label.append(int(dev[i][4]))
    dev_label = np.array(dev_label)
    dev_data = trans(dev, X, count_vect)
    dev_data = np.array(dev_data)
    return X_train, dev_data, dev_label, label


def logistic_preprocess(data, words, words_set):
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
In [125]:
important_words, log_F1 = logistic(total_processed_data, processed_data, processed_dev)
0.7028985507246377
['debris', 'spill', 'knock', 'japan', 'bigger', 'temple', 'outbreak', 'israeli', 'wish', 'ebay']
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
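The influential-word list above ranks features by absolute coefficient size. An alternative that keeps the sign, so that disaster-indicating and benign-indicating words are read off separately, is sketched below; the names cv and clf are hypothetical placeholders for a vectorizer and classifier fitted like the ones inside logistic():

# Sketch: read off the most positive and most negative coefficients of a fitted model.
import numpy as np

def top_signed_terms(cv, clf, k=10):
    terms = cv.get_feature_names_out()
    coefs = clf.coef_[0]           # positive values push towards the "disaster" class (target 1)
    order = np.argsort(coefs)
    return list(terms[order[-k:]][::-1]), list(terms[order[:k]])

# most_disaster, most_benign = top_signed_terms(cv, clf)  # hypothetical fitted objects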
In [136]:
def gram_trans(x, X, count_vect):
    '''Convert each row's token list into a binary bigram feature vector over the fitted vocabulary.'''
    train = x
    X_train = []
    stop_words = count_vect.stop_words_
    gram_trans_a(stop_words, train)
    gram_trans_b(train)
    gram_trans_c(X, X_train, count_vect, train)
    return X_train


def gram_trans_c(X, X_train, count_vect, train):
    # Look each bigram up in the fitted vocabulary and set the matching column to 1.
    for i in range(len(train)):
        X_train.append([])
        for j in range(X.shape[1]):
            X_train[i].append(0)
        for j in train[i][3]:
            index = count_vect.vocabulary_.get(j)
            if index is None:
                continue
            X_train[i][index] = 1


def gram_trans_b(train):
    # Replace each row's token list with the list of adjacent-word bigrams.
    for i in range(len(train)):
        temp_list = []
        for j in range(0, len(train[i][3]) - 1):
            temp_list.append(train[i][3][j] + " " + train[i][3][j + 1])
        train[i][3] = temp_list


def gram_trans_a(stop_words, train):
    # Remove tokens that appear in the vectorizer's stop_words_ set (terms pruned by min_df).
    for i in range(len(train)):
        delete_list = set()
        for j in train[i][3]:
            if j in stop_words:
                delete_list.add(j)
        for j in delete_list:
            train[i][3].remove(j)


def n_gram_naive_bayes(data, train, dev):
    '''Bernoulli naive Bayes on binary bigram features; returns the dev-set F1.'''
    data = train
    words = []
    words_set = set()
    label = []
    ngram_pre_process(data, words, words_set)
    count_vect = CountVectorizer(binary=True, min_df=10, ngram_range=(2, 2))
    X = count_vect.fit_transform(words)
    X_train = X
    n = X_train.shape[0]
    d = X_train.shape[1]
    K = 2
    dev_label, phis, psis = ngrams_process_lable(K, X_train, d, dev, label, n, train)
    dev_label = np.array(dev_label)
    dev_data = gram_trans(dev, X, count_vect)
    dev_data = np.array(dev_data)

    def nb_predictions(data, label, psis, phis):
        # Evaluate log p(y) + sum_j log p(x_j | y) for both classes and pick the larger.
        x = data
        n, d = x.shape
        x = np.reshape(x, (1, n, d))
        psis = np.reshape(psis, (K, 1, d))
        psis = psis.clip(1e-14, 1 - 1e-14)
        logpy = np.log(phis).reshape([K, 1])
        logpxy = x * np.log(psis) + (1 - x) * np.log(1 - psis)
        logpyx = logpxy.sum(axis=2) + logpy
        return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K, n]), label

    idx, logpyx, dev_label = nb_predictions(dev_data, dev_label, psis, phis)
    F1 = f1_score(dev_label, idx)
    print(F1)
    return F1


def ngrams_process_lable(K, X_train, d, dev, label, n, train):
    # Estimate the class priors (phis) and per-class feature probabilities (psis).
    psis = np.zeros([K, d])
    phis = np.zeros([K])
    for i in range(len(train)):
        label.append(int(train[i][4]))
    label = np.array(label)
    # compute the parameters
    for k in range(K):
        X_k = X_train[label == k]
        psis[k] = np.mean(X_k, axis=0)
        phis[k] = X_k.shape[0] / float(n)
    dev_label = []
    for i in range(len(dev)):
        dev_label.append(int(dev[i][4]))
    return dev_label, phis, psis


def ngram_pre_process(data, words, words_set):
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
In [137]:
# import total_processed_data
n_gram_naive_bayes_F1 = n_gram_naive_bayes(total_processed_data, processed_data, processed_dev)
0.0
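For comparison, CountVectorizer can build and apply the bigram features end to end (fit on the training text, transform on the dev text), which keeps the two representations consistent. This is only a hedged sketch, not the notebook's method; it reloads the saved splits so the in-place edits made by gram_trans above do not interfere:

# Sketch: bigram Bernoulli naive Bayes with the vectorizer handling both fit and transform.
from sklearn.naive_bayes import BernoulliNB

tr_rows = preprocess_data(read_data("train_set.csv"))
dev_rows = preprocess_data(read_data("dev_set.csv"))
texts_tr = [" ".join(row[3]) for row in tr_rows]
texts_dev = [" ".join(row[3]) for row in dev_rows]
y_tr = [int(row[4]) for row in tr_rows]
y_dev = [int(row[4]) for row in dev_rows]

cv2 = CountVectorizer(binary=True, min_df=10, ngram_range=(2, 2))
model = BernoulliNB().fit(cv2.fit_transform(texts_tr), y_tr)
print(f1_score(y_dev, model.predict(cv2.transform(texts_dev))))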
In [99]:
def gram_trans(x, X, count_vect):
    '''Convert each row's token list into a binary bigram feature vector over the fitted vocabulary.'''
    train = x
    X_train = []
    stop_words = count_vect.stop_words_
    gram_trans_preprocess_a(stop_words, train)
    gram_trans_preprocess_b(train)
    gram_trans_preprocess_c(X, X_train, count_vect, train)
    return X_train


def gram_trans_preprocess_c(X, X_train, count_vect, train):
    # Look each bigram up in the fitted vocabulary and set the matching column to 1.
    for i in range(len(train)):
        X_train.append([])
        for j in range(X.shape[1]):
            X_train[i].append(0)
        for j in train[i][3]:
            index = count_vect.vocabulary_.get(j)
            if index is None:
                continue
            X_train[i][index] = 1


def gram_trans_preprocess_b(train):
    # Replace each row's token list with the list of adjacent-word bigrams.
    for i in range(len(train)):
        temp_list = []
        for j in range(0, len(train[i][3]) - 1):
            temp_list.append(train[i][3][j] + " " + train[i][3][j + 1])
        train[i][3] = temp_list


def gram_trans_preprocess_a(stop_words, train):
    # Remove tokens that appear in the vectorizer's stop_words_ set (terms pruned by min_df).
    for i in range(len(train)):
        delete_list = set()
        for j in train[i][3]:
            if j in stop_words:
                delete_list.add(j)
        for j in delete_list:
            train[i][3].remove(j)


def n_gram_logistic(data, train, dev):
    '''Logistic regression on binary bigram features; returns the most influential bigrams and the dev-set F1.'''
    # decide the threshold M
    # Note: it is unclear whether the word bag should be built from the 70% train split or from the whole data set.
    data = train
    words = []
    words_set = set()
    label = []
    ngram_logi_preprocess(data, words, words_set)
    count_vect = CountVectorizer(binary=True, min_df=10, ngram_range=(2, 2))
    X = count_vect.fit_transform(words)
    X_train = X
    n = X_train.shape[0]
    d = X_train.shape[1]
    K = 2
    # shapes of parameters
    dev_data, dev_label, label = ngram_logi_lable(
        K, X, X_train, count_vect, d, dev, label, n, train
    )
    logreg = LogisticRegression(
        C=1e5, multi_class="multinomial", verbose=True, max_iter=1000
    )
    logreg.fit(X_train, label)
    dev_predict = logreg.predict(dev_data)
    F1 = f1_score(dev_label, dev_predict)
    print(F1)
    influential_words = ngram_logi_influential_words(count_vect, logreg)
    return influential_words, F1


def ngram_logi_influential_words(count_vect, logreg):
    # Take the 10 features with the largest absolute coefficients and map them back to bigrams.
    coef = logreg.coef_
    coef = [abs(i) for i in coef[0]]
    max_num_index_list = map(coef.index, heapq.nlargest(10, coef))
    max_num_index_list = list(max_num_index_list)
    import_words = set()
    most_ = max_num_index_list
    for i in count_vect.vocabulary_.keys():
        if count_vect.vocabulary_[i] in most_:
            import_words.add(i)
    influential_words = []
    for i in import_words:
        influential_words.append(i)
    print(influential_words)
    return influential_words


def ngram_logi_lable(K, X, X_train, count_vect, d, dev, label, n, train):
    # Estimate the class priors (phis) and per-class feature probabilities (psis),
    # and vectorize the dev set with bigram features.
    psis = np.zeros([K, d])
    phis = np.zeros([K])
    for i in range(len(train)):
        label.append(int(train[i][4]))
    label = np.array(label)
    # compute the parameters
    for k in range(K):
        X_k = X_train[label == k]
        psis[k] = np.mean(X_k, axis=0)
        phis[k] = X_k.shape[0] / float(n)
    dev_label = []
    for i in range(len(dev)):
        dev_label.append(int(dev[i][4]))
    dev_label = np.array(dev_label)
    dev_data = gram_trans(dev, X, count_vect)
    dev_data = np.array(dev_data)
    return dev_data, dev_label, label


def ngram_logi_preprocess(data, words, words_set):
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
In [100]:
n_gram_important_words, n_gram_log_F1 = n_gram_logistic(total_processed_data, processed_data, processed_dev)
0.0
['train derail', 'train derailment', 'nuclear disaster', 'hundreds migrants', 'reunion island', 'break news', 'rescuers search', 'northern california', 'hiroshima nagasaki', 'helicopter crash']
In [101]:
def append_data(data):
    '''Prepend the keyword and location fields to each row's text so they become extra features.'''
    for i in range(len(data)):
        temp_str = ""
        temp_str = temp_str + data[i][1] + " "
        temp_str = temp_str + data[i][2] + " "
        temp_str = temp_str + " ".join(data[i][3])
        data[i][3] = temp_str
    return data
In [102]:
append_total_data = append_data(data)
append_train_set = append_data(train_set)
append_dev_set = append_data(dev_set)
In [103]:
total_processed_data = preprocess_data(append_total_data)
processed_data = preprocess_data(append_train_set)
processed_dev = preprocess_data(append_dev_set)
In [104]:
naive_bayes_F1 = naive_bayes(total_processed_data, processed_data, processed_dev)
important_words, log_F1 = logistic(total_processed_data, processed_data, processed_dev)
n_gram_naive_bayes_F1 = n_gram_naive_bayes(total_processed_data, processed_data, processed_dev)
n_gram_important_words, n_gram_log_F1 = n_gram_logistic(total_processed_data, processed_data, processed_dev)
0.679266895761741
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
0.6513911620294599
['hiroshima', 'debris', 'libya', 'typhoon', 'return', 'mosque', 'space', 'ignition', 'legionnaires', 'abc']
0.32157506152584087
0.0
['train derail', 'train derailment', 'nuclear disaster', 'hundreds migrants', 'reunion island', 'break news', 'rescuers search', 'miss malaysia', 'hiroshima nagasaki', 'helicopter crash']
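Collecting the scores from the run above into one small table makes the comparison easier to read; a convenience sketch that only reuses the variables already computed:

# Sketch: summarise the dev-set F1 scores computed above.
results = pd.DataFrame({
    "model": ["Naive Bayes", "Logistic regression",
              "Bigram Naive Bayes", "Bigram logistic regression"],
    "dev F1": [naive_bayes_F1, log_F1, n_gram_naive_bayes_F1, n_gram_log_F1],
})
print(results)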
In [105]:
def predict_test(train, dev):
    '''Train Bernoulli naive Bayes on the full training data and write test-set predictions to submit.csv.'''
    data = train
    id = []
    label, words = test_pre_process(data, dev, id)
    count_vect = CountVectorizer(binary=True, min_df=10)
    X = count_vect.fit_transform(words).toarray()
    X_train = trans(train, X, count_vect)
    X_train = np.array(X_train)
    n = X_train.shape[0]
    d = X_train.shape[1]
    K = 2
    phis, psis = test_lable_preprocess(K, X_train, d, label, n, train)
    dev_data = trans(dev, X, count_vect)
    dev_data = np.array(dev_data)

    def nb_predictions(data, psis, phis):
        # Evaluate log p(y) + sum_j log p(x_j | y) for both classes and pick the larger.
        x = data
        n, d = x.shape
        x = np.reshape(x, (1, n, d))
        psis = np.reshape(psis, (K, 1, d))
        psis = psis.clip(1e-14, 1 - 1e-14)
        logpy = np.log(phis).reshape([K, 1])
        logpxy = x * np.log(psis) + (1 - x) * np.log(1 - psis)
        logpyx = logpxy.sum(axis=2) + logpy
        return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K, n])

    idx, logpyx = nb_predictions(dev_data, psis, phis)
    print(len(idx))
    print(len(id))
    ex = {"id": id, "target": idx}
    d = DataFrame(ex)
    d.to_csv("./submit.csv", index=False)


def test_lable_preprocess(K, X_train, d, label, n, train):
    # Estimate the class priors (phis) and per-class feature probabilities (psis).
    psis = np.zeros([K, d])
    phis = np.zeros([K])
    for i in range(len(train)):
        label.append(int(train[i][4]))
    label = np.array(label)
    for k in range(K):
        X_k = X_train[label == k]
        psis[k] = np.mean(X_k, axis=0)
        phis[k] = X_k.shape[0] / float(n)
    return phis, psis


def test_pre_process(data, dev, id):
    # Collect the test ids and join each training row's tokens into a space-separated string.
    for i in range(len(dev)):
        id.append(dev[i][0])
    words = []
    words_set = set()
    label = []
    for i in range(len(data)):
        temp = ""
        for j in range(len(data[i][3])):
            temp += data[i][3][j]
            words_set.add(data[i][3][j])
            if j != len(data[i][3]) - 1:
                temp += " "
        words.append(temp)
    return label, words
In [106]:
test_data = preprocess_data(test_data)
predict_test(total_processed_data, test_data)
3263
3263
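A quick check of the generated submission before uploading; a small convenience sketch that only reads back the file written by predict_test above:

# Sketch: read the submission back and check its shape and label balance.
submission = pd.read_csv("submit.csv")
print(submission.shape)
print(submission['target'].value_counts())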