# Bert英文文本分类练习

**Repository Path**: AwakeBo/BERT_english-text-classification-exercises

## Basic Information

- **Project Name**: Bert英文文本分类练习
- **Description**: A hands-on BERT practice project
- **Primary Language**: Unknown
- **License**: Not specified
- **Default Branch**: master
- **Homepage**: None
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 1
- **Created**: 2025-05-23
- **Last Updated**: 2025-07-30

## Categories & Tags

**Categories**: Uncategorized
**Tags**: bert

## README

# BERT English Text Classification

## Complete Code

```python
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

import datetime

# Record the start time so total runtime can be reported at the end
starttime = datetime.datetime.now()

# Set initial variables and constants
# % config InlineBackend.figure_format='retina'

# Graph designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('reviews.csv')
df.shape
df.isnull().sum()

# Function to convert score to sentiment
def to_sentiment(rating):
    rating = int(rating)

    # Convert to class
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

# Apply to the dataset
df['sentiment'] = df.score.apply(to_sentiment)

# Plot the distribution
class_names = ['negative', 'neutral', 'positive']
# print(df.sentiment)
# ax = sns.countplot(df.sentiment)
# plt.xlabel('review sentiment')
# ax.set_xticklabels(class_names)
print(len(df[df['sentiment'] == 0]))
print(len(df[df['sentiment'] == 1]))
print(len(df[df['sentiment'] == 2]))

# Set the model name
MODEL_NAME = 'bert-base-cased'

# Build a BERT based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Some of the common BERT tokens
print(tokenizer.sep_token, tokenizer.sep_token_id)  # marker for ending of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id)  # start of each sentence, so BERT knows we're doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id)  # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id)  # tokens not found in training set

# Store length of each review
token_lens = []

# Iterate through the content column
for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

# Plot the distribution of review lengths
sns.distplot(token_lens)
plt.xlim([0, 256])
plt.xlabel('Token count')

MAX_LEN = 160


class GPReviewDataset(Dataset):
    # Constructor function
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    # Length magic method
    def __len__(self):
        return len(self.reviews)

    # Get item magic method
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        # Encoded format to be returned
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
print(df_train.shape, df_val.shape, df_test.shape)


def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0
    )


# Create train, test and val data loaders
BATCH_SIZE = 32
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Examples
data = next(iter(train_data_loader))
print(data.keys())
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

# Load the basic BERT model
bert_model = BertModel.from_pretrained(MODEL_NAME)


# Build the Sentiment Classifier class
class SentimentClassifier(nn.Module):

    # Constructor
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    # Forward propagation
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )

        # Add a dropout layer
        output = self.drop(pooled_output)
        return self.out(output)


# Instantiate the model and move it to the device
model = SentimentClassifier(len(class_names))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
device

# Number of hidden units
print(bert_model.config.hidden_size)

# Number of iterations
EPOCHS = 10

# Optimizer AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function
loss_fn = nn.CrossEntropyLoss().to(device)

from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()


def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        optimizer.zero_grad()

        # The forward pass must run inside autocast so it is executed in mixed (fp16) precision
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # Unscale the gradients first so clipping acts on their true magnitude
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # Advance the linear warmup/decay schedule once per batch
        scheduler.step()

    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    # Show details
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    print(f"Train loss {train_loss} accuracy {train_acc}")

    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model, val_data_loader, loss_fn, device, len(df_val)
    )
    print(f"Val loss {val_loss} accuracy {val_acc}")
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

# Report the elapsed training time
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)

# Load the best saved model
model.load_state_dict(torch.load('best_model_state.bin'))

test_acc, _ = eval_model(
    model, test_data_loader, loss_fn, device, len(df_test)
)
test_acc.item()


def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values


y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model, test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))


def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment');


cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

review_text = "I love completing my todos! Best app ever!!!"

encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')
```

------

## Code Walkthrough

### Environment Setup

```python
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

import datetime

# Record the start time
starttime = datetime.datetime.now()

# Set initial variables and constants
# % config InlineBackend.figure_format='retina'

# Graph designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```

**Key points**:

1. The Transformers library is HuggingFace's toolkit of pre-trained models; you will use it constantly in graduate-level research.
2. Mixed-precision training (the `GradScaler` used later in the code) speeds up training significantly and is an essential technique when training large models.

------

### 🌟 Part 1: Data Preparation and Visualization

```python
df = pd.read_csv('reviews.csv')
df.shape           # shape of the DataFrame: how many rows and columns it contains
df.isnull().sum()  # number of missing values in each column

# Convert the rating into a sentiment class
def to_sentiment(rating):
    rating = int(rating)      # make sure the value is an integer
    if rating <= 2:
        return 0              # negative — the low end of the thermometer
    elif rating == 3:
        return 1              # neutral — room temperature
    else:
        return 2              # positive — the high end

df['sentiment'] = df.score.apply(to_sentiment)  # add a sentiment column, like sticking a label on each row

# The three sentiment class labels, corresponding to 0, 1, 2
class_names = ['negative', 'neutral', 'positive']

# print(df.sentiment)
# ax = sns.countplot(df.sentiment)
# plt.xlabel('review sentiment')
# ax.set_xticklabels(class_names)
print(len(df[df['sentiment'] == 0]))
print(len(df[df['sentiment'] == 1]))
print(len(df[df['sentiment'] == 2]))
```

**Practical tips**:

1. Class imbalance: by printing the number of samples per class you can decide whether to use a weighted cross-entropy loss (a sketch follows at the end of this part).
2. Real research often requires more elaborate labeling rules, for example rules based on the text content rather than the score alone.
3. The visualization step (`sns.countplot`) helps you understand the data distribution quickly and is a common source of paper figures.

In this code, `df` is a **pandas DataFrame**, the standard object for handling and analysing structured data (for example data loaded from CSV, Excel, or a database).

---

🔍 An example of what `df` is:

Suppose you have the following data:

| user_id | score |
| :-----: | :---: |
|    1    |   5   |
|    2    |   3   |
|    3    |   1   |

When you load this data with pandas it is stored in a `DataFrame`, which you can create like this:

```python
import pandas as pd

data = {'user_id': [1, 2, 3], 'score': [5, 3, 1]}
df = pd.DataFrame(data)
```

At this point the variable `df` is your data table — you can think of it as a "spreadsheet" inside Python.

---

🧠 Back to the question: what does `df` do in this code?

```python
df['sentiment'] = df.score.apply(to_sentiment)
```

This line:

- iterates over the `score` column of the `df` table;
- applies `to_sentiment()` to each value, converting the rating into a sentiment class (0, 1, 2);
- stores the results in a new column called `'sentiment'`.

**✅ Resulting effect:**

After running the line above, `df` looks like this:

| user_id | score | sentiment |
| :-----: | :---: | :-------: |
|    1    |   5   |     2     |
|    2    |   3   |     1     |
|    3    |   1   |     0     |

---

**Explanation of `len(df[df['sentiment']==0])`**

1. Inner expression `df['sentiment']==0`:
   - produces a boolean Series (an array of True/False values)
   - whose length equals the number of rows in the DataFrame
   - for example: `[True, False, True, False, ...]`
2. Outer expression `df[boolean Series]`:
   - pandas' filtering mechanism
   - keeps only the rows where the boolean Series is True
   - equivalent to a SQL `WHERE` filter
3. `len()`:
   - counts the rows that remain after filtering
   - each True corresponds to one match
   - so the result is the total number of records with `sentiment == 0`

📌 Summary:

`df` is a pandas DataFrame, i.e. a tabular data structure. The `apply` method iterates over every rating in `df.score` and passes it to the `to_sentiment` function as the `rating` argument. In this code, `df` stores the data together with its rating (`score`) and gains a new `sentiment` column holding the sentiment class for each rating.
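As a follow-up to tip 1, here is a minimal sketch of a weighted cross-entropy loss. The class counts below are made-up numbers used only for illustration; in the actual project you would take them from the three `print()` calls above (or from `df['sentiment'].value_counts()`), and whether inverse-frequency weighting is the right scheme depends on your data.

```python
import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hypothetical per-class sample counts for negative / neutral / positive.
class_counts = torch.tensor([2500.0, 1200.0, 8000.0])

# Inverse-frequency weights: rarer classes get a larger weight.
weights = class_counts.sum() / (len(class_counts) * class_counts)

# Drop-in replacement for the plain nn.CrossEntropyLoss() used later in the script.
loss_fn = nn.CrossEntropyLoss(weight=weights.to(device))
```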
### 🔍 Part 2: BERT Preprocessing

```python
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 160

class GPReviewDataset(Dataset):
    def __getitem__(self, item):
        encoding = tokenizer.encode_plus(
            review,
            add_special_tokens=True,     # add [CLS] and [SEP], like bookmarks around the sentence
            max_length=MAX_LEN,          # cap the sentence length, like choosing a picture-frame size
            padding='max_length',        # pad short sentences, like foam filling a parcel
            truncation=True,             # cut off anything longer than MAX_LEN
            return_attention_mask=True,  # mark which positions are real content and which are padding
            return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
```

**Core knowledge**:

1. The `[CLS]` token is used for classification tasks; `[SEP]` separates sentences.
2. The attention mask tells the model which positions are real tokens (1) and which are padding (0).
3. Dynamic padding (padding each batch to the length of its longest sequence instead of a fixed MAX_LEN) is a more advanced way to save GPU memory — see the sketch below.
4. Real projects may involve domain-specific vocabulary, in which case you need a custom vocabulary.
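A small sketch of the dynamic-padding idea from point 3. Instead of padding every review to a fixed `MAX_LEN`, the tokenizer can pad each batch only to the length of its longest member (`padding=True`), which wastes far fewer pad tokens. The example sentences are made up, and this assumes a reasonably recent `transformers` version where the tokenizer is callable on a list of texts.

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# A hypothetical mini-batch of raw reviews.
batch_texts = [
    "Great app, works perfectly!",
    "Terrible. It crashes every single time I try to open it on my phone.",
]

# padding=True pads only up to the longest sequence in this batch;
# truncation + max_length still enforce BERT's 512-token limit.
encoding = tokenizer(
    batch_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

print(encoding['input_ids'].shape)    # (2, length of the longest review in this batch)
print(encoding['attention_mask'][0])  # 1 for real tokens, 0 for padding
```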
------

### Part 4: Model Architecture

```python
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False)
        return self.out(self.drop(pooled_output))
```

**Deeper understanding**:

1. `pooled_output` is the hidden state of the `[CLS]` token passed through a linear layer and a Tanh activation.
2. The dropout probability of 0.3 is an empirical value; in a real project it should be chosen by experiment.
3. Freezing the BERT parameters (`param.requires_grad = False`) is a common transfer-learning technique — see the sketch below.
4. The hidden size of 768 comes from the BERT-base design; the large variant uses 1024.
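A minimal sketch of the parameter-freezing trick from point 3, reusing the `SentimentClassifier`, `class_names`, and `AdamW` already defined in the script above. Freezing the whole encoder and training only the classification head is just one choice; partially unfreezing the top layers is also common.

```python
model = SentimentClassifier(len(class_names))

# Freeze every parameter of the BERT encoder; only the dropout + linear head remain trainable.
for param in model.bert.parameters():
    param.requires_grad = False

# Give the optimizer only the parameters that still require gradients.
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=2e-5,
    correct_bias=False,
)
```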
------

### Part 5: Training Optimization

```python
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(...)
loss_fn = nn.CrossEntropyLoss().to(device)

# Mixed-precision training
scaler = GradScaler()
with autocast():
    outputs = model(input_ids, attention_mask)
    loss = loss_fn(outputs, targets)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
```

**Engineering practice**:

1. A learning rate of 2e-5 is a typical value for fine-tuning BERT (the original paper recommends choosing from roughly 2e-5 to 5e-5).
2. The warmup strategy helps stabilise the large gradient updates at the start of training.
3. Gradient clipping (`clip_grad_norm_`) prevents exploding gradients.
4. Mixed-precision training saves GPU memory while keeping accuracy, and must be paired with `GradScaler` (see the sketch below for the full update order).
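A sketch showing how points 2–4 fit together in a single mixed-precision update, using the objects defined in the full script above. The 10% warmup ratio is an assumed value (the script itself uses `num_warmup_steps=0`); the key detail is calling `scaler.unscale_()` before `clip_grad_norm_` so that clipping sees the true gradient magnitudes.

```python
total_steps = len(train_data_loader) * EPOCHS

# Warm the learning rate up over roughly the first 10% of updates (assumed ratio).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

# One training step:
optimizer.zero_grad()
with autocast():                       # forward pass in fp16
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = loss_fn(outputs, targets)

scaler.scale(loss).backward()          # backward on the scaled loss
scaler.unscale_(optimizer)             # bring gradients back to fp32 scale
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip at the true norm
scaler.step(optimizer)                 # optimizer step (skipped if grads contain inf/NaN)
scaler.update()                        # adjust the loss scale for the next step
scheduler.step()                       # advance the warmup/decay schedule
```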
------

### Part 6: Evaluation and Deployment

```python
def eval_model(...):
    with torch.no_grad():  # disable gradient computation to speed up inference
        ...

# Save the model
torch.save(model.state_dict(), 'best_model_state.bin')

# Inference example
encoded_review = tokenizer.encode_plus(...)
output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
```

**Practical tips**:

1. `torch.no_grad()` can reduce memory usage by roughly 20%–30%.
2. A saved model should include the complete state (model + optimizer) so that training can be resumed.
3. For real deployment, wrap preprocessing and inference into an API service.

------

### Essential Extensions for Graduate Study

1. **Domain adaptation**: continue pre-training so that BERT adapts to a specific domain (medical/financial).
2. **Model distillation**: compress a large model into a smaller one that is easier to deploy.
3. **Multimodal fusion**: combine text with images or structured data.
4. **Interpretability analysis**: explain model predictions with LIME/SHAP.
5. **Efficient training techniques**: parameter freezing, gradient accumulation, distributed training.

Suggested directions for further practice:

1. Try different pre-trained models (RoBERTa, ALBERT).
2. Implement an early-stopping mechanism (a sketch follows at the end of this document).
3. Add layer selection (extract features from different layers).
4. Run hyperparameter searches (learning rate, batch size).
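As a starting point for suggestion 2, here is a minimal early-stopping sketch built around the `train_epoch` and `eval_model` functions from the script above. The `patience` value and the checkpoint file name are assumptions; the checkpoint also stores the optimizer state, as recommended in Part 6.

```python
patience = 3                      # assumed: stop after 3 epochs without validation improvement
best_accuracy = 0
epochs_without_improvement = 0

for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
    print(f"Epoch {epoch + 1}: train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

    if val_acc > best_accuracy:
        best_accuracy = val_acc
        epochs_without_improvement = 0
        # Save the full state so training can be resumed later.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_accuracy': best_accuracy,
        }, 'best_checkpoint.bin')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping after epoch {epoch + 1}")
            break
```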