# Bert英文文本分类练习

**Repository Path**: AwakeBo/BERT_english-text-classification-exercises

## Basic Information

- **Project Name**: Bert英文文本分类练习
- **Description**: A hands-on BERT practice project
- **Primary Language**: Unknown
- **License**: Not specified
- **Default Branch**: master
- **Homepage**: None
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 1
- **Created**: 2025-05-23
- **Last Updated**: 2025-07-30

## Categories & Tags

**Categories**: Uncategorized
**Tags**: bert

## README

# BERT English Text Classification

## Complete Code

```python
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

import datetime

# Record the start time so total runtime can be reported at the end
starttime = datetime.datetime.now()

# Set initial variables and constants
# % config InlineBackend.figure_format='retina'

# Graph designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('reviews.csv')
df.shape
df.isnull().sum()

# Function to convert score to sentiment
def to_sentiment(rating):
    rating = int(rating)

    # Convert to class
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

# Apply to the dataset
df['sentiment'] = df.score.apply(to_sentiment)

# Plot the distribution
class_names = ['negative', 'neutral', 'positive']
# print(df.sentiment)
# ax = sns.countplot(df.sentiment)
# plt.xlabel('review sentiment')
# ax.set_xticklabels(class_names)
print(len(df[df['sentiment'] == 0]))
print(len(df[df['sentiment'] == 1]))
print(len(df[df['sentiment'] == 2]))

# Set the model name
MODEL_NAME = 'bert-base-cased'

# Build a BERT based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Some of the common BERT tokens
print(tokenizer.sep_token, tokenizer.sep_token_id)  # marker for ending of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id)  # start of each sentence, so BERT knows we're doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id)  # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id)  # tokens not found in training set

# Store length of each review
token_lens = []

# Iterate through the content column
for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

# Plot the distribution of review lengths
sns.distplot(token_lens)
plt.xlim([0, 256])
plt.xlabel('Token count')

MAX_LEN = 160


class GPReviewDataset(Dataset):
    # Constructor function
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    # Length magic method
    def __len__(self):
        return len(self.reviews)

    # Get item magic method
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        # Encoded format to be returned
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
print(df_train.shape, df_val.shape, df_test.shape)


def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )


# Create train, test and val data loaders
BATCH_SIZE = 32
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Examples
data = next(iter(train_data_loader))
print(data.keys())
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

# Load the basic BERT model
bert_model = BertModel.from_pretrained(MODEL_NAME)


# Build the Sentiment Classifier class
class SentimentClassifier(nn.Module):

    # Constructor
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    # Forward propagation
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )

        # Add a dropout layer
        output = self.drop(pooled_output)
        return self.out(output)


# Instantiate the model and move it to the device
model = SentimentClassifier(len(class_names))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
device

# Number of hidden units
print(bert_model.config.hidden_size)

# Number of iterations
EPOCHS = 10

# Optimizer AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function
loss_fn = nn.CrossEntropyLoss().to(device)

from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        optimizer.zero_grad()

        # The forward pass must run inside autocast so it is executed in mixed (fp16) precision
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # Unscale the gradients first so clipping acts on their true magnitude
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # Advance the linear warmup/decay schedule once per batch
        scheduler.step()

    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    # Show details
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    print(f"Train loss {train_loss} accuracy {train_acc}")

    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model, val_data_loader, loss_fn, device, len(df_val)
    )
    print(f"Val loss {val_loss} accuracy {val_acc}")
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

# Report the elapsed training time
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

# Load the best saved model
model.load_state_dict(torch.load('best_model_state.bin'))

test_acc, _ = eval_model(
    model, test_data_loader, loss_fn, device, len(df_test)
)
test_acc.item()


def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values


y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model, test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))


def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment');


cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

review_text = "I love completing my todos! Best app ever!!!"

encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')
```

------

## Code Walkthrough

### Environment Setup

```python
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

import datetime

# Record the start time
starttime = datetime.datetime.now()

# Set initial variables and constants
# % config InlineBackend.figure_format='retina'

# Graph designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```

**Key points**:

1. The Transformers library is HuggingFace's toolkit of pre-trained models; you will use it constantly in graduate-level research.
2. Mixed-precision training (the `GradScaler` used later in the code) speeds up training significantly and is an essential technique when training large models.

------

### 🌟 Part 1: Data Preparation and Visualization

```python
df = pd.read_csv('reviews.csv')
df.shape           # shape of the DataFrame: how many rows and columns it contains
df.isnull().sum()  # number of missing values in each column

# Convert the rating into a sentiment class
def to_sentiment(rating):
    rating = int(rating)      # make sure the value is an integer
    if rating <= 2:
        return 0              # negative — the low end of the thermometer
    elif rating == 3:
        return 1              # neutral — room temperature
    else:
        return 2              # positive — the high end

df['sentiment'] = df.score.apply(to_sentiment)  # add a sentiment column, like sticking a label on each row

# The three sentiment class labels, corresponding to 0, 1, 2
class_names = ['negative', 'neutral', 'positive']

# print(df.sentiment)
# ax = sns.countplot(df.sentiment)
# plt.xlabel('review sentiment')
# ax.set_xticklabels(class_names)
print(len(df[df['sentiment'] == 0]))
print(len(df[df['sentiment'] == 1]))
print(len(df[df['sentiment'] == 2]))
```

**Practical tips**:

1. Class imbalance: by printing the number of samples per class you can decide whether to use a weighted cross-entropy loss (a sketch follows at the end of this part).
2. Real research often requires more elaborate labeling rules, for example rules based on the text content rather than the score alone.
3. The visualization step (`sns.countplot`) helps you understand the data distribution quickly and is a common source of paper figures.

In this code, `df` is a **pandas DataFrame**, the standard object for handling and analysing structured data (for example data loaded from CSV, Excel, or a database).

---

🔍 An example of what `df` is:

Suppose you have the following data:

| user_id | score |
| :-----: | :---: |
|    1    |   5   |
|    2    |   3   |
|    3    |   1   |

When you load this data with pandas it is stored in a `DataFrame`, which you can create like this:

```python
import pandas as pd

data = {'user_id': [1, 2, 3], 'score': [5, 3, 1]}
df = pd.DataFrame(data)
```

At this point the variable `df` is your data table — you can think of it as a "spreadsheet" inside Python.

---

🧠 Back to the question: what does `df` do in this code?

```python
df['sentiment'] = df.score.apply(to_sentiment)
```

This line:

- iterates over the `score` column of the `df` table;
- applies `to_sentiment()` to each value, converting the rating into a sentiment class (0, 1, 2);
- stores the results in a new column called `'sentiment'`.

**✅ Resulting effect:**

After running the line above, `df` looks like this:

| user_id | score | sentiment |
| :-----: | :---: | :-------: |
|    1    |   5   |     2     |
|    2    |   3   |     1     |
|    3    |   1   |     0     |

---

**Explanation of `len(df[df['sentiment']==0])`**

1. Inner expression `df['sentiment']==0`:
   - produces a boolean Series (an array of True/False values)
   - whose length equals the number of rows in the DataFrame
   - for example: `[True, False, True, False, ...]`
2. Outer expression `df[boolean Series]`:
   - pandas' filtering mechanism
   - keeps only the rows where the boolean Series is True
   - equivalent to a SQL `WHERE` filter
3. `len()`:
   - counts the rows that remain after filtering
   - each True corresponds to one match
   - so the result is the total number of records with `sentiment == 0`

📌 Summary:

`df` is a pandas DataFrame, i.e. a tabular data structure. The `apply` method iterates over every rating in `df.score` and passes it to the `to_sentiment` function as the `rating` argument. In this code, `df` stores the data together with its rating (`score`) and gains a new `sentiment` column holding the sentiment class for each rating.
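As a follow-up to tip 1, here is a minimal sketch of a weighted cross-entropy loss. The class counts below are made-up numbers used only for illustration; in the actual project you would take them from the three `print()` calls above (or from `df['sentiment'].value_counts()`), and whether inverse-frequency weighting is the right scheme depends on your data.

```python
import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical per-class sample counts for negative / neutral / positive.
class_counts = torch.tensor([2500.0, 1200.0, 8000.0])

# Inverse-frequency weights: rarer classes get a larger weight.
weights = class_counts.sum() / (len(class_counts) * class_counts)

# Drop-in replacement for the plain nn.CrossEntropyLoss() used later in the script.
loss_fn = nn.CrossEntropyLoss(weight=weights.to(device))
```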
### 🔍 Part 2: BERT Preprocessing

```python
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 160

class GPReviewDataset(Dataset):
    def __getitem__(self, item):
        encoding = tokenizer.encode_plus(
            review,
            add_special_tokens=True,     # add [CLS] and [SEP], like bookmarks around the sentence
            max_length=MAX_LEN,          # cap the sentence length, like choosing a picture-frame size
            padding='max_length',        # pad short sentences, like foam filling a parcel
            truncation=True,             # cut off anything longer than MAX_LEN
            return_attention_mask=True,  # mark which positions are real content and which are padding
            return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
```

**Core knowledge**:

1. The `[CLS]` token is used for classification tasks; `[SEP]` separates sentences.
2. The attention mask tells the model which positions are real tokens (1) and which are padding (0).
3. Dynamic padding (padding each batch to the length of its longest sequence instead of a fixed MAX_LEN) is a more advanced way to save GPU memory — see the sketch below.
4. Real projects may involve domain-specific vocabulary, in which case you need a custom vocabulary.
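A small sketch of the dynamic-padding idea from point 3. Instead of padding every review to a fixed `MAX_LEN`, the tokenizer can pad each batch only to the length of its longest member (`padding=True`), which wastes far fewer pad tokens. The example sentences are made up, and this assumes a reasonably recent `transformers` version where the tokenizer is callable on a list of texts.

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# A hypothetical mini-batch of raw reviews.
batch_texts = [
    "Great app, works perfectly!",
    "Terrible. It crashes every single time I try to open it on my phone.",
]

# padding=True pads only up to the longest sequence in this batch;
# truncation + max_length still enforce BERT's 512-token limit.
encoding = tokenizer(
    batch_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

print(encoding['input_ids'].shape)    # (2, length of the longest review in this batch)
print(encoding['attention_mask'][0])  # 1 for real tokens, 0 for padding
```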
------

### Part 4: Model Architecture

```python
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False)
        return self.out(self.drop(pooled_output))
```

**Deeper understanding**:

1. `pooled_output` is the hidden state of the `[CLS]` token passed through a linear layer and a Tanh activation.
2. The dropout probability of 0.3 is an empirical value; in a real project it should be chosen by experiment.
3. Freezing the BERT parameters (`param.requires_grad = False`) is a common transfer-learning technique — see the sketch below.
4. The hidden size of 768 comes from the BERT-base design; the large variant uses 1024.
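A minimal sketch of the parameter-freezing trick from point 3, reusing the `SentimentClassifier`, `class_names`, and `AdamW` already defined in the script above. Freezing the whole encoder and training only the classification head is just one choice; partially unfreezing the top layers is also common.

```python
model = SentimentClassifier(len(class_names))

# Freeze every parameter of the BERT encoder; only the dropout + linear head remain trainable.
for param in model.bert.parameters():
    param.requires_grad = False

# Give the optimizer only the parameters that still require gradients.
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=2e-5,
    correct_bias=False,
)
```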
------

### Part 5: Training Optimization

```python
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(...)
loss_fn = nn.CrossEntropyLoss().to(device)

# Mixed-precision training
scaler = GradScaler()
with autocast():
    outputs = model(input_ids, attention_mask)
    loss = loss_fn(outputs, targets)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
```

**Engineering practice**:

1. A learning rate of 2e-5 is a typical value for fine-tuning BERT (the original paper recommends choosing from roughly 2e-5 to 5e-5).
2. The warmup strategy helps stabilise the large gradient updates at the start of training.
3. Gradient clipping (`clip_grad_norm_`) prevents exploding gradients.
4. Mixed-precision training saves GPU memory while keeping accuracy, and must be paired with `GradScaler` (see the sketch below for the full update order).
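A sketch showing how points 2–4 fit together in a single mixed-precision update, using the objects defined in the full script above. The 10% warmup ratio is an assumed value (the script itself uses `num_warmup_steps=0`); the key detail is calling `scaler.unscale_()` before `clip_grad_norm_` so that clipping sees the true gradient magnitudes.

```python
total_steps = len(train_data_loader) * EPOCHS

# Warm the learning rate up over roughly the first 10% of updates (assumed ratio).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

# One training step:
optimizer.zero_grad()
with autocast():                       # forward pass in fp16
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_fn(outputs, targets)

scaler.scale(loss).backward()          # backward on the scaled loss
scaler.unscale_(optimizer)             # bring gradients back to fp32 scale
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip at the true norm
scaler.step(optimizer)                 # optimizer step (skipped if grads contain inf/NaN)
scaler.update()                        # adjust the loss scale for the next step
scheduler.step()                       # advance the warmup/decay schedule
```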
------

### Part 6: Evaluation and Deployment

```python
def eval_model(...):
    with torch.no_grad():  # disable gradient computation to speed up inference
        ...

# Save the model
torch.save(model.state_dict(), 'best_model_state.bin')

# Inference example
encoded_review = tokenizer.encode_plus(...)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
```

**Practical tips**:

1. `torch.no_grad()` can reduce memory usage by roughly 20%–30%.
2. A saved model should include the complete state (model + optimizer) so that training can be resumed.
3. For real deployment, wrap preprocessing and inference into an API service.

------

### Essential Extensions for Graduate Study

1. **Domain adaptation**: continue pre-training so that BERT adapts to a specific domain (medical/financial).
2. **Model distillation**: compress a large model into a smaller one that is easier to deploy.
3. **Multimodal fusion**: combine text with images or structured data.
4. **Interpretability analysis**: explain model predictions with LIME/SHAP.
5. **Efficient training techniques**: parameter freezing, gradient accumulation, distributed training.

Suggested directions for further practice:

1. Try different pre-trained models (RoBERTa, ALBERT).
2. Implement an early-stopping mechanism (a sketch follows at the end of this document).
3. Add layer selection (extract features from different layers).
4. Run hyperparameter searches (learning rate, batch size).
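As a starting point for suggestion 2, here is a minimal early-stopping sketch built around the `train_epoch` and `eval_model` functions from the script above. The `patience` value and the checkpoint file name are assumptions; the checkpoint also stores the optimizer state, as recommended in Part 6.

```python
patience = 3                      # assumed: stop after 3 epochs without validation improvement
best_accuracy = 0
epochs_without_improvement = 0

for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
    print(f"Epoch {epoch + 1}: train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        epochs_without_improvement = 0
        # Save the full state so training can be resumed later.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_accuracy': best_accuracy,
        }, 'best_checkpoint.bin')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping after epoch {epoch + 1}")
            break
```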