การตรวจจับข้อความสแปมใน SMS โดยใช้ TensorFlow

Delta fare — Sun, 14 Apr 2024 06:49:39 +0000

ในชีวิตประจำวันเราอยู่กับการใช้โทรศัพท์ติดต่อสื่อสารกัน แต่ในบางครั้งอาจจะมีมิจฉาชีพส่งข้อความเพื่อทำให้เรารำคาญ โดยเขียนข้อความหลอกให้เรากรอกข้อมูลส่วนตัวเราลงไปเช่น อีเมล์ เบอร์โทรศัพท์ หรือบัญชีธนาคาร ถ้าเราเผลอกรอกลงไปอาจจะทำให้ข้อมูลส่วนตัวรั่วไหลได้
ในบทความนี้ เราจะพัฒนาระบบ Deep learning โดยใช้ TensorFlow ในการตรวจจับข้อความสแปมและวิเคราะห์ตัวชี้วัดประสิทธิภาพของโมเดล
โดยจะใช้ dataset ที่มีข้อความ SMS พร้อมป้ายกำกับว่าแต่ละข้อความเป็นสแปม (spam) หรือไม่ใช่สแปม (ham)
ดาวน์โหลด ที่นี่

ขั้นตอนการทำ

นำเข้า library ทั้งหมดเข้าไปใน Google Colab

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

โหลดชุดข้อมูลโดยใช้ฟังก์ชัน .read_csv() ของ pandas

# Read the SMS dataset that was uploaded to the Colab session (decoded as
# latin-1) and preview its first rows.
csv_path = "/content/spam.csv"
df = pd.read_csv(csv_path, encoding="latin-1")
df.head()

โดยให้นำเข้าชุดข้อมูลที่ดาวน์โหลดไว้ อัปโหลดบน Google Colab

เมื่อโหลดข้อมูลและรันเรียบร้อย เราจะได้ตารางข้อมูลมา จะเห็นได้ว่าชุดข้อมูลประกอบด้วยคอลัมน์ที่ไม่มีชื่อ 3 คอลัมน์มีค่า null ดังนั้นเราจะลบคอลัมน์นั้นและเปลี่ยนชื่อคอลัมน์จาก v1 และ v2 เป็น label และ Text ตามลำดับ เนื่องจากตัวแปรเป้าหมายอยู่ในรูปแบบของข้อความ เราจะเข้ารหัสเป็นตัวเลขโดยใช้ฟังก์ชัน .map() ของ pandas

# Remove the three unnamed columns, give v1/v2 readable names, and add a
# numeric target column (ham -> 0, spam -> 1).
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df = df.rename(columns={'v1': 'label', 'v2': 'Text'})
df['label_enc'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()

ข้อมูลหลังจากเปลี่ยนตัวแปรแล้ว

ต่อไปมาดูการกระจายตัวข้อมูลทั้งสอง

# Bar chart of how many messages carry each label.
sns.countplot(data=df, x='label')
plt.show()

ข้อมูลประเภท ham มีจำนวนมากกว่า spam เป็นเรื่องปกติ โดยที่เรากำลังจะฝังข้อมูลในโมเดล deep learning เราไม่จำเป็นต้องสมดุลข้อมูล เราจะหาค่าเฉลี่ยของจำนวนคำในประโยคทั้งหมดในข้อมูล SMS

# Number of whitespace-separated tokens in each message, then the rounded
# mean over all messages.
word_counts = [len(message.split()) for message in df['Text']]
avg_words_len = round(sum(word_counts) / len(word_counts))
print(avg_words_len)

ผลลัพธ์จาก code คือจำนวนคำเฉลี่ยใน SMS

ต่อไปหาจำนวนคำที่ไม่ซ้ำกันทั้งหมด

# Collect every distinct whitespace-separated token across all messages and
# report how many there are.
s = {word for sent in df['Text'] for word in sent.split()}
total_words_length = len(s)
print(total_words_length)

คำที่ไม่ซ้ำมีทั้งหมด

ต่อไป แบ่งข้อมูลเป็นส่วนฝึกและทดสอบโดยใช้ฟังก์ชัน train_test_split()

from sklearn.model_selection import train_test_split

# Rebuild a two-column frame (text + numeric label) and hold out 20% of the
# rows for testing; random_state fixes the shuffle.
X = np.asanyarray(df['Text'])
y = np.asanyarray(df['label_enc'])
new_df = pd.DataFrame({'Text': X, 'label': y})
X_train, X_test, y_train, y_test = train_test_split(
    new_df['Text'],
    new_df['label'],
    test_size=0.2,
    random_state=42,
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

ข้อมูลสำหรับการฝึกและทดสอบ

ต่อไปเราจะเริ่มสร้างโมเดลพื้นฐานก่อน จากนั้นจะเพิ่มประสิทธิภาพของโมเดลโดยใช้โมเดล deep learning เช่น การฝังข้อมูล (embeddings) และ LSTM

เราจะใช้ MultinomialNB() ในการแยกข้อความที่มีลักษณะไม่ต่อเนื่องเหมือนคำพูดหรือคำในเวกเตอร์ tf-idf โดยค่า tf-idf หาว่าคำนั้นเกี่ยวข้องกับเนื้อหาเราไหม

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,accuracy_score

# Learn the tf-idf vocabulary from the training messages only, then encode
# both splits with it.
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(X_train)
X_train_vec = tfidf_vec.transform(X_train)
X_test_vec = tfidf_vec.transform(X_test)

# Baseline classifier trained on the tf-idf features.
baseline_model = MultinomialNB()
baseline_model.fit(X_train_vec, y_train)

ประสิทธิภาพของโมเดลพื้นฐาน

# Predict once on the test features and reuse the predictions for both the
# accuracy value and the per-class report.
baseline_preds = baseline_model.predict(X_test_vec)
nb_accuracy = accuracy_score(y_test, baseline_preds)
print(nb_accuracy)
print(classification_report(y_test, baseline_preds))

การสร้างชั้นการเวกเตอร์ข้อความแบบกำหนดเอง การสร้างชั้นการเวกเตอร์ข้อความเป็นการทำให้ข้อมูลถูกเข้ารหัสเป็นตัวเลข เช่น ความถี่ของคำ Binary Term เป็นต้น

from tensorflow.keras.layers import TextVectorization

# Vocabulary cap and fixed sequence length, taken from the corpus statistics
# computed earlier (distinct word count and average words per message).
MAXTOKENS = total_words_length
OUTPUTLEN = avg_words_len

# Layer that lower-cases text, strips punctuation and emits integer token ids
# in sequences of length OUTPUTLEN.
text_vec = TextVectorization(
    output_mode='int',
    output_sequence_length=OUTPUTLEN,
    max_tokens=MAXTOKENS,
    standardize='lower_and_strip_punctuation',
)
# Build the vocabulary from the training messages.
text_vec.adapt(X_train)

โดย MAXTOKENS คือขนาดสูงสุดของพจนานุกรม (vocabulary)
และ OUTPUTLEN คือความยาวที่ทุกประโยคจะถูกเติม (padding) หรือตัด (truncate) ให้เท่ากัน ไม่ว่าความยาวเดิมของประโยคจะเป็นอย่างไร
ตัวอย่างประโยค โดยใช้การสร้างชั้นเวกเตอร์ข้อความ

ต่อไปสร้างเลเยอร์สำหรับฝังข้อมูล

# Embedding layer that maps each token id (vocabulary size MAXTOKENS) to a
# 128-dimensional vector, for input sequences of length OUTPUTLEN.
embedding_layer = layers.Embedding(
    input_dim=MAXTOKENS,
    output_dim=128,
    input_length=OUTPUTLEN,
    embeddings_initializer='uniform',
)

input_dim คือขนาดของพจนานุกรม (vocabulary)
output_dim ขนาดของเวกเตอร์ที่คำจะถูกฝังลงไป
input_length คือความยาวของลำดับของข้อมูลนำเข้า
สร้างและทดสอบโมเดลแรก

# Model 1: raw string -> token ids -> embeddings -> average over the
# sequence -> small dense head with a sigmoid output.
input_layer = layers.Input(shape=(1,), dtype=tf.string)
vec_layer = text_vec(input_layer)
embedding_layer_model = embedding_layer(vec_layer)
x = layers.GlobalAveragePooling1D()(embedding_layer_model)
x = layers.Flatten()(x)
x = layers.Dense(32, activation='relu')(x)
output_layer = layers.Dense(1, activation='sigmoid')(x)
model_1 = keras.Model(input_layer, output_layer)

model_1.compile(optimizer='adam', loss=keras.losses.BinaryCrossentropy(
    label_smoothing=0.5), metrics=['accuracy'])

# Show the layer-by-layer summary of the first model.
model_1.summary()

# Train the first model. Without this step model_1 would still have its
# randomly initialised weights when it is evaluated alongside the other
# models later on.
history_1 = model_1.fit(X_train,
                        y_train,
                        epochs=5,
                        validation_data=(X_test, y_test))

ข้อสรุปของโมเดลแรก

การทดสอบโมเดลแรก

ตารางประวัติการทำงานโมเดลแรก

ต่อไปสร้างฟังก์ชันช่วยสำหรับคอมไพล์, การฝึกและประเมินประสิทธิภาพของโมเดล

from sklearn.metrics import precision_score, recall_score, f1_score

def compile_model(model):
    '''
    Compile ``model`` in place with the Adam optimizer, binary
    cross-entropy loss and the accuracy metric. Returns nothing.
    '''
    optimizer = keras.optimizers.Adam()
    loss_fn = keras.losses.BinaryCrossentropy()
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

def fit_model(model, epochs, X_train=X_train, y_train=y_train,
            X_test=X_test, y_test=y_test):
    '''
    Train ``model`` and return whatever ``model.fit`` returns.

    model   : a compiled model exposing a Keras-style ``fit`` method
    epochs  : number of training epochs
    X_train, y_train : training inputs and labels (default: the
                       notebook-level training split)
    X_test, y_test   : data used as ``validation_data`` after each epoch
                       (default: the notebook-level test split)

    The whole validation set is evaluated each epoch. ``validation_steps``
    is deliberately not passed: it counts batches, not samples, so deriving
    it from ``len(X_test)`` asked for more validation batches than the
    in-memory test split provides.
    '''
    history = model.fit(X_train,
                        y_train,
                        epochs=epochs,
                        validation_data=(X_test, y_test))
    return history

def evaluate_model(model, X, y):
    '''
    Score ``model`` on inputs ``X`` against true labels ``y``.

    The output of ``model.predict(X)`` is rounded to the nearest integer
    before scoring. Returns a dict with the keys 'accuracy', 'precision',
    'recall' and 'f1-score'.
    '''
    predicted = np.round(model.predict(X))
    return {
        'accuracy': accuracy_score(y, predicted),
        'precision': precision_score(y, predicted),
        'recall': recall_score(y, predicted),
        'f1-score': f1_score(y, predicted),
    }

Bidirectional LSTM Bidirectional LSTM (Long short-term memory) ประกอบด้วย LSTMs สองตัว โดยมีหนึ่งตัวรับข้อมูลในทิศทางหนึ่งและอีกตัวหนึ่งในทิศทางอื่น โดย BiLSTMs เพิ่มประสิทธิภาพของข้อมูลที่สามารถเข้าถึงได้ให้กับเครือข่าย เพิ่มความเข้าใจของข้อความสำหรับอัลกอริทึม สร้างโมเดลตัวที่สอง

# Model 2: raw string -> token ids -> embeddings -> two stacked
# bidirectional LSTMs -> dropout -> small dense head with a sigmoid output.
input_layer = layers.Input(shape=(1,), dtype=tf.string)
vec_layer = text_vec(input_layer)
# Give this model its own embedding layer. Reusing the `embedding_layer`
# instance that model_1 is built on would share the same weights between
# the two models, so training model_2 would silently change model_1 before
# model_1 is evaluated. The sequence length is taken from text_vec's output.
embedding_layer_2 = layers.Embedding(
    input_dim=MAXTOKENS,
    output_dim=128,
    embeddings_initializer='uniform',
)
embedding_layer_model = embedding_layer_2(vec_layer)
# First BiLSTM returns the full sequence so the second one can consume it.
bi_lstm = layers.Bidirectional(layers.LSTM(
    64, activation='tanh', return_sequences=True))(embedding_layer_model)
lstm = layers.Bidirectional(layers.LSTM(64))(bi_lstm)
flatten = layers.Flatten()(lstm)
dropout = layers.Dropout(.1)(flatten)
x = layers.Dense(32, activation='relu')(dropout)
output_layer = layers.Dense(1, activation='sigmoid')(x)
model_2 = keras.Model(input_layer, output_layer)

compile_model(model_2)  # compile the model
history_2 = fit_model(model_2, epochs=5)  # fit the model

ผลทดสอบโมเดลตัวที่สอง

Transfer Learning ด้วย USE Encoder Transfer learning คือการนำโมเดลอื่นมาใช้งานร่วมกับโมเดลที่เราใช้อยู่ Universal Sentence Encoder คือการแปลงข้อความเป็นเวกเตอร์ในการจำแนกประเภท ความหมาย และการประยุกต์คำ USE สามารถโหลดได้จาก tensorflow_hub และสามารถใช้เป็นชั้นได้โดยใช้ฟังก์ชัน hub.KerasLayer()

import tensorflow_hub as hub

# Universal Sentence Encoder loaded straight from TF Hub; it takes raw
# strings and is kept frozen (trainable=False).
use_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    trainable=False,
    input_shape=[],
    dtype=tf.string,
    name='USE',
)

# Sequential model: USE encoder -> dropout -> dense(64, relu) -> sigmoid.
model_3 = keras.Sequential([
    use_layer,
    layers.Dropout(0.2),
    layers.Dense(64, activation=keras.activations.relu),
    layers.Dense(1, activation=keras.activations.sigmoid),
])

compile_model(model_3)

history_3 = fit_model(model_3, epochs=5)

ผลการทดสอบโมเดลที่สาม

ต่อไปทำการวิเคราะห์ประสิทธิภาพของโมเดลทั้งหมด

# Score every model on the test split; the baseline takes the tf-idf
# features, the Keras models take the raw text.
baseline_model_results = evaluate_model(baseline_model, X_test_vec, y_test)
model_1_results = evaluate_model(model_1, X_test, y_test)
model_2_results = evaluate_model(model_2, X_test, y_test)
model_3_results = evaluate_model(model_3, X_test, y_test)

# One row per model, one column per metric.
results_by_model = {
    'MultinomialNB Model': baseline_model_results,
    'Custom-Vec-Embedding Model': model_1_results,
    'Bidirectional-LSTM Model': model_2_results,
    'USE-Transfer learning Model': model_3_results,
}
total_results = pd.DataFrame(results_by_model).T

total_results

สรุปผล
หลังจากการทดสอบโมเดลทั้งสี่แบบแล้วพบว่ามีความแม่นยำมากกว่า 96% ทั้งนี้อาจจะสังเกตยาก เนื่องจากมีความแม่นยำที่สูงและไม่มีความแตกต่าง แต่มี metric ที่มีความสำคัญที่ทำให้เราสามารถคำนวณหาประสิทธิภาพโมเดลได้ดีก็คือ f1-score เราจึงสามารถระบุได้แล้วว่าโมเดล USE-Transfer Learning มีความแม่นยำและประสิทธิภาพที่ดีสุด โดยมี f1-score ที่มากที่สุด

โดยในที่นี้ใช้ชุดข้อมูลเพียงชุดเดียว หากมีชุดข้อมูลที่มากหรือละเอียดกว่านี้ก็จะช่วยให้เห็นความแตกต่างของผลลัพธ์ได้ชัดเจนยิ่งขึ้น :)

ชุดข้อมูล

DEV Community: Delta fare

การตรวจจับข้อความสแปมใน SMS โดยใช้ TensorFlow

ขั้นตอนการทำ