PyTorch Text Classification
Text sentiment analysis is a fundamental task in Natural Language Processing (NLP), aimed at determining the emotional tendency (positive/negative) expressed in a piece of text. This project will use PyTorch to build a deep learning model to implement sentiment classification of movie reviews.
### Application Scenarios of Sentiment Analysis
* Product review analysis
* Social media public opinion monitoring
* Customer service feedback classification
* Market trend prediction
* * *
## Environment Setup
### Required Tools and Libraries
## Instance
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import numpy as np
### Install Dependencies
pip install torch torchtext spacy
python -m spacy download en_core_web_sm
* * *
## Data Preparation
### Dataset Introduction
This project uses the IMDB movie review dataset, which contains 50,000 reviews with sentiment labels (positive/negative).
### Data Preprocessing
## Instance
# Define field processing.
# TEXT tokenizes each review with spaCy. include_lengths=True makes every batch
# expose `batch.text` as a (token_ids, lengths) pair; the lengths are what the
# model later feeds to pack_padded_sequence.
TEXT = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
)
# No vocabulary is built for the labels (use_vocab=False), so the label column
# is assumed to already hold numeric 0/1 values -- TODO confirm against the CSV.
# dtype must be float: nn.BCEWithLogitsLoss rejects the integer (long) targets
# that Field produces by default.
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)

# Load dataset.
# NOTE(review): if the CSV files start with a header row, skip_header=True is
# needed here -- confirm against the actual files.
train_data, test_data = TabularDataset.splits(
    path='./data',
    train='train.csv',
    test='test.csv',
    format='csv',
    fields=[('text', TEXT), ('label', LABEL)],
)

# Build vocabulary from the training split only, capped at 25,000 tokens, and
# attach the pretrained 100-dimensional GloVe vectors to it.
TEXT.build_vocab(
    train_data,
    max_size=25000,
    vectors="glove.6B.100d",
)
* * *
## Model Building
### LSTM Model Architecture
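The model consists of an embedding layer, a two-layer bidirectional LSTM, a dropout layer, and a fully connected layer that outputs a single sentiment logit.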
### Model Implementation Code
## Instance
class SentimentLSTM(nn.Module):
    """Bidirectional multi-layer LSTM that maps a token sequence to logits.

    Pipeline: embedding -> dropout -> packed bidirectional LSTM -> concatenate
    the last layer's final forward and backward hidden states -> dropout ->
    linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of output logits per sequence.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden representation. Defaults to 0.5.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: Integer tensor of token indices laid out as
                (sequence length, batch size); the LSTM is built with the
                default batch_first=False.
            text_lengths: 1-D integer tensor with the unpadded length of each
                sequence in the batch. Sequences may appear in any order.

        Returns:
            Tensor of shape (batch size, output_dim) containing raw logits.
        """
        embedded = self.dropout(self.embedding(text))
        # Packing makes the LSTM skip padded positions. Lengths must live on
        # the CPU, and enforce_sorted=False lets callers pass batches that are
        # not sorted by decreasing length.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden[-2] / hidden[-1] are the last layer's final forward and
        # backward hidden states.
        hidden = self.dropout(
            torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)
* * *
## Model Training
### Training Parameter Settings
## Instance
# Model parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # must match the 100-d GloVe vectors attached to TEXT.vocab
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit: positive vs. negative
N_LAYERS = 2

# Use the GPU when one is available. Batches fed to the model must be on the
# same device (e.g. build the iterators with device=device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = SentimentLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS)

# Start from the pretrained GloVe vectors loaded by TEXT.build_vocab; without
# this copy they are downloaded but never used by the model.
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

model = model.to(device)

# Optimizer and loss function.
# BCEWithLogitsLoss applies the sigmoid internally, so the model emits raw logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
### Training Loop
## Instance
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Args:
        model: Module called as model(text, text_lengths), returning one logit
            per example in a trailing dimension of size 1.
        iterator: Sized iterable of batches exposing `batch.text` as a
            (token_ids, lengths) pair and `batch.label`.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as criterion(predictions, labels).

    Returns:
        Tuple (mean loss per batch, mean accuracy per batch).
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # enable dropout
    for batch in iterator:
        text, text_lengths = batch.text
        # BCEWithLogitsLoss requires float targets; the cast is a no-op when
        # the labels are already float.
        labels = batch.label.float()
        # squeeze(1) drops the trailing size-1 dimension so predictions line
        # up with the 1-D label tensor.
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += accuracy(predictions, labels)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
* * *
## Model Evaluation
### Evaluation Function
## Instance
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over `iterator` without updating the model.

    Args:
        model: Module called as model(text, text_lengths), returning one logit
            per example in a trailing dimension of size 1.
        iterator: Sized iterable of batches exposing `batch.text` as a
            (token_ids, lengths) pair and `batch.label`.
        criterion: Loss called as criterion(predictions, labels).

    Returns:
        Tuple (mean loss per batch, mean accuracy per batch).
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # disable dropout
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in iterator:
            text, text_lengths = batch.text
            # BCEWithLogitsLoss requires float targets; the cast is a no-op
            # when the labels are already float.
            labels = batch.label.float()
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predictions, labels)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
### Accuracy Calculation
## Instance
def accuracy(preds, y):
    """Return the fraction of predictions that match the labels.

    Args:
        preds: Tensor of raw logits, one per example.
        y: Tensor of 0/1 labels with the same shape as `preds`.

    Returns:
        0-dim tensor holding the share of examples whose rounded sigmoid
        output equals the label.
    """
    # Sigmoid maps logits to probabilities; rounding turns them into 0/1.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
* * *
## Model Application
### Predicting New Text
## Instance
# spaCy pipeline used for tokenizing new sentences; it is the same language
# model that the TEXT field is configured with.
nlp = spacy.load('en_core_web_sm')


def predict_sentiment(model, sentence):
    """Return the sigmoid of the model's output for `sentence`.

    Args:
        model: Trained module called as model(text, text_lengths).
        sentence: Raw review text.

    Returns:
        Float in [0, 1]: the sigmoid of the model's single output logit.

    Raises:
        ValueError: If `sentence` contains no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    # Map every token to its vocabulary index.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length_tensor = torch.LongTensor([len(indexed)])
    # Put the input on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device
    # unsqueeze(1) adds the batch dimension: (sequence length, 1).
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(model_device)
    # Predict in eval mode (dropout off) without gradients, then restore the
    # mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = torch.sigmoid(model(tensor, length_tensor))
    finally:
        model.train(was_training)
    return prediction.item()
### Example Prediction
## Instance
# Two hand-written sample reviews, scored with the model one after the other.
positive_review = "This movie was fantastic! I really enjoyed it."
negative_review = "The film was terrible and boring."

positive_score = predict_sentiment(model, positive_review)
print(f"Positive review score: {positive_score:.4f}")

negative_score = predict_sentiment(model, negative_review)
print(f"Negative review score: {negative_score:.4f}")
YouTip