
In this post I'll try classifying the SMSSpamCollection dataset with PyTorch.

Environment

  • python 3.6.2
  • Anaconda 4.3.27
  • Windows 10
  • scikit-learn 0.19.0
  • nltk 3.2.4
  • pytorch 0.2.1

Setup

import argparse
import codecs
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as O
from torch.autograd import Variable


parser = argparse.ArgumentParser()
parser.add_argument('--batchsize', '-b', type=int, default=50,
                    help="Minibatch size")
parser.add_argument('--epoch', '-e', type=int, default=100,
                    help="Number of epochs")
parser.add_argument('--learning_rate', '-lr', type=float, default=0.1,
                    help="Learning rate")
parser.add_argument('--units', '-u', type=int, default=100,
                    help="Number of hidden units")
args = parser.parse_args()
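
With these options in place, the script can be run from the command line like this (sms_spam.py is just a placeholder for whatever you name the file):

python sms_spam.py --batchsize 50 --epoch 100 --learning_rate 0.1 --units 100

All four flags are optional and default to the values shown above.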

Downloading the data

The dataset used here is the SMS Spam Collection from the Machine Learning Repository provided by the University of California, Irvine.

https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

Download smsspamcollection.zip from the URL above.
Unzipping it gives a file named SMSSpamCollection, which is what we'll work with.
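
If you would rather fetch and unpack the archive from Python instead of by hand, a minimal sketch using only the standard library (with the URL above) would be:

import urllib.request
import zipfile

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# Download the archive and extract SMSSpamCollection into the current directory
urllib.request.urlretrieve(url, "smsspamcollection.zip")
with zipfile.ZipFile("smsspamcollection.zip") as z:
    z.extractall(".")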

What the data looks like

The contents of SMSSpamCollection look like this:

Each line starts with a label (ham or spam), followed by the text of that message, separated by a tab.

ham Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

ham Ok lar... Joking wif u oni...

spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

ham U dun say so early hor... U c already then say...

ham Nah I don't think he goes to usf, he lives around here though

spam FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv

The dataset contains 5574 messages in total: 4827 ham and 747 spam.

with codecs.open("SMSSpamCollection", "r", "utf-8") as f:
    data = f.read().splitlines()
N = len(data) #5574
ham_size = sum(d.split('\t')[0] == "ham" for d in data) #4827
spam_size = N - ham_size #747
print("Number of data: {}\nNumber of ham data: {}\nNumber of spam data: {}".format(N, ham_size, spam_size))

Preprocessing the data

Separating text and labels

Split the loaded data into text and labels, with spam mapped to 1 and ham to 0.

text = [d.split('\t')[1] for d in data]
label = [1 if d.split('\t')[0] == "spam" else 0 for d in data]

Normalization

Remove punctuation from the text.

text = ["".join(ch for ch in line if ch not in string.punctuation) for line in text]

Convert all the text to lowercase.

text = [line.lower() for line in text]

Removing stopwords and words that appear fewer than three times in the dataset

# Stopword list
stopwords_list = set(stopwords.words('english'))
# Word dictionary for words that appear at least 3 times in the dataset
word2id = {"<UNK>": 0}
# Set of all words in the dataset, with stopwords removed
word_list = set(word for line in text
                for word in word_tokenize(line) if word not in stopwords_list)

cnt = Counter()
# Count how often each remaining word appears in the dataset
for line in text:
    for word in line.split():
        if word in word_list:
            cnt[word] += 1

# Keep only the words that appear at least 3 times
for word in word_list:
    if word not in word2id and cnt[word] >= 3:
        word2id[word] = len(word2id)

Bag of Words

This time we'll use a bag-of-words (BoW) representation: each normalized line of text is converted into a vector of word counts over the vocabulary.

bow_set = []
for line in text:
    bow = [0] * len(word2id)
    for word in line.split():
        # Words not in the vocabulary (stopwords or rare words) are skipped
        if word in word2id:
            bow[word2id[word]] += 1
    bow_set.append(bow)
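
To make this concrete, here is a toy example of what the loop produces (the vocabulary and sentence are made up purely for illustration):

# Hypothetical toy vocabulary; index 0 is reserved for <UNK>
word2id_toy = {"<UNK>": 0, "free": 1, "win": 2, "call": 3}
line = "free free call now"
bow = [0] * len(word2id_toy)
for word in line.split():
    if word in word2id_toy:
        bow[word2id_toy[word]] += 1
print(bow)  # [0, 2, 0, 1] -- "now" is out of vocabulary and is skipped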

Splitting into training and test data

Split the data into a training set and a test set (90% / 10%).

train_x, test_x, train_t, test_t = train_test_split(bow_set, label, test_size=0.1)

Training

Defining the network

A simple feed-forward network with an input layer, two hidden layers, and an output layer.

class Bag_Of_Words(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super(Bag_Of_Words, self).__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_hidden)
        self.l3 = nn.Linear(n_hidden, n_out)

    def forward(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        # nn.CrossEntropyLoss applies log-softmax itself, so return raw logits here
        y = self.l3(h)
        return y

Setting the parameters

batchsize = args.batchsize
epochs = args.epoch
learning_rate = args.learning_rate
n_in = len(word2id)
n_hidden = args.units
n_out = 2
n_batch = len(train_x) // batchsize
history = {"loss": []}

# Build the model
model = Bag_Of_Words(n_in, n_hidden, n_out)
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer = O.Adam(model.parameters(), lr=learning_rate)

Training loop

for epoch in range(epochs):
    print("Epoch {}".format(epoch))
    train_x, train_t = shuffle(train_x, train_t)
    for i in range(n_batch):
        # Get mini batch data...
        start = i * batchsize
        end = start + batchsize
        x = Variable(torch.FloatTensor(train_x[start:end]))
        t = Variable(torch.LongTensor(train_t[start:end]))
        optimizer.zero_grad()
        y = model(x)
        loss = criterion(y, t)
        loss.backward()
        optimizer.step()
    history['loss'].append(loss.data[0])
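
history['loss'] stores the loss of the last minibatch of each epoch but is not used anywhere else; if you want to check that training actually converged, a quick plot could look like the sketch below (matplotlib is not in the environment list above, so treat this as optional):

import matplotlib.pyplot as plt

# Plot the per-epoch loss recorded during training and save it to a file
plt.plot(history['loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.savefig('loss.png')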

Test

Pass the test data to the trained model and check the accuracy with the following code.

var = Variable(torch.FloatTensor(test_x), requires_grad=False)
result = model(var)
predicted = torch.max(result, 1)[1]
print("{:.2f}".format(sum(p == t for p, t in zip(predicted.data, test_t)) / len(test_t)))

Results

0.92

The accuracy was 0.92. Training and testing a few more times gave roughly 90% accuracy each run.
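
Since roughly 87% of the messages are ham, accuracy on its own is a bit flattering (always predicting ham would already score about 0.87), so it can be worth looking at per-class precision and recall as well. A minimal sketch with scikit-learn, assuming predicted from the test code above:

from sklearn.metrics import classification_report

# Compare the predicted labels (converted to a plain Python list) with test_t
print(classification_report(test_t, predicted.data.tolist(), target_names=["ham", "spam"]))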

Full code

import argparse
import codecs
import string
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as O
from torch.autograd import Variable

class Bag_Of_Words(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super(Bag_Of_Words, self).__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_hidden)
        self.l3 = nn.Linear(n_hidden, n_out)

    def forward(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        # nn.CrossEntropyLoss applies log-softmax itself, so return raw logits here
        y = self.l3(h)
        return y


parser = argparse.ArgumentParser()
parser.add_argument('--batchsize', '-b', type=int, default=50,
                    help="Minibatch size")
parser.add_argument('--epoch', '-e', type=int, default=100,
                    help="Number of epochs")
parser.add_argument('--learning_rate', '-lr', type=float, default=0.1,
                    help="Learning rate")
parser.add_argument('--units', '-u', type=int, default=100,
                    help="Number of hidden units")
args = parser.parse_args()

# Load data...
path = "SMSSpamCollection"
with codecs.open(path, "r", "utf-8") as f:
    data = f.read().splitlines()

# Split data into text data and label one.
text = [d.split('\t')[1] for d in data]
label = [1 if d.split('\t')[0] == "spam" else 0 for d in data]

# Normalization
# Remove punctuation
text = ["".join(ch for ch in line if ch not in string.punctuation) for line in text]
# Change all characters to lowercase
text = [line.lower() for line in text]

# Stopword list
stopwords_list = set(stopwords.words('english'))
# Word dictionary for words that appear at least 3 times in the dataset
word2id = {"<UNK>": 0}
# Set of all words in the dataset, with stopwords removed
word_list = set(word for line in text
                for word in word_tokenize(line) if word not in stopwords_list)

cnt = Counter()
# Count how often each remaining word appears in the dataset
for line in text:
    for word in line.split():
        if word in word_list:
            cnt[word] += 1

# Keep only the words that appear at least 3 times
for word in word_list:
    if word not in word2id and cnt[word] >= 3:
        word2id[word] = len(word2id)

# Convert the text to bag-of-words vectors
bow_set = []
for line in text:
    bow = [0] * len(word2id)
    for word in line.split():
        # Words not in the vocabulary (stopwords or rare words) are skipped
        if word in word2id:
            bow[word2id[word]] += 1
    bow_set.append(bow)

# Split data into training set and test one.
train_x, test_x, train_t, test_t = train_test_split(bow_set, label, test_size=0.1)

# Setting parameters
batchsize = args.batchsize
epochs = args.epoch
learning_rate = args.learning_rate
n_in = len(word2id)
n_hidden = args.units
n_out = 2
n_batch = len(train_x) // batchsize
history = {"loss": []}

# Model
model = Bag_Of_Words(n_in, n_hidden, n_out)
# Loss
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer = O.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    print("Epoch {}".format(epoch))
    # Shuffle
    train_x, train_t = shuffle(train_x, train_t)
    for i in range(n_batch):
        # Get mini batch data...
        start = i * batchsize
        end = start + batchsize
        x = Variable(torch.FloatTensor(train_x[start:end]))
        t = Variable(torch.LongTensor(train_t[start:end]))
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        y = model(x)
        loss = criterion(y, t)
        loss.backward()
        optimizer.step()
    history['loss'].append(loss.data[0])

var = Variable(torch.FloatTensor(test_x), requires_grad=False)
result = model(var)
predicted = torch.max(result, 1)[1]
print("{:.2f}".format(sum(p == t for p, t in zip(predicted.data, test_t)) / len(test_t)))

