2 Star 0 Fork 0

邓龙 / Gaokao-Application

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
parse_data.py 3.22 KB
一键复制 编辑 原始数据 按行查看 历史
import os
import pandas as pd
from typing import Mapping
from utils import *
# Make pandas account for East Asian wide characters so printed tables line up.
pd.set_option("display.unicode.east_asian_width", True)

# Ensure both output directories exist before any table is written.
for _output_dir in (DIR_ADMISSION_SCORE_DATA, DIR_SCORE_RANK_TABLE_DATA):
    os.makedirs(_output_dir, exist_ok=True)
def filter_columns(df, cols_map: Mapping[str, str]):
    """Select, in ``cols_map`` order, the columns of ``df`` holding the wanted headers.

    A column of ``df`` is considered to be the wanted column ``col_name`` when
    any of its cells equals one of the candidate header strings registered for
    ``col_name`` in ``cols_map``.

    Args:
        df: The raw table to search.
        cols_map: Maps each wanted column name to its candidate header
            strings. Each value is handed to ``DataFrame.isin``, so it must be
            list-like (NOTE(review): the ``Mapping[str, str]`` annotation looks
            narrower than what the code needs; confirm against ``utils``).

    Returns:
        A DataFrame made of the matched columns, one per key of ``cols_map``,
        in the iteration order of ``cols_map``.

    Raises:
        ValueError: If a wanted column matches no column of ``df`` or matches
            more than one.
    """
    col_idxs = []
    for col_name, possible_names in cols_map.items():
        # Search the table that was passed in, not a module-level global.
        matches = df.isin(possible_names).any()
        # Use positions rather than labels: the result is consumed by ``iloc``,
        # which is positional, so this also works for non-integer column labels.
        candidates = matches.to_numpy().nonzero()[0].tolist()
        if not candidates:
            print(df)
            raise ValueError(f'解析失败,未找到列“{col_name}”')
        elif len(candidates) > 1:
            raise ValueError(f'解析失败,列“{col_name}”重复,位置{candidates}')
        col_idxs.append(candidates[0])
    return df.iloc[:, col_idxs]
def parse_admission_file_name(file_name):
    """Parse an admission-score page file name into ``(year, batch)``.

    The year is the first four characters of ``file_name`` converted with
    ``int``. The batch is the value of the first entry in ``BATCH_NAMES`` whose
    key occurs as a substring of ``file_name``.

    Args:
        file_name: Name of the page file.

    Returns:
        A ``(year, batch)`` tuple.

    Raises:
        ValueError: If the first four characters are not an integer, or if no
            key of ``BATCH_NAMES`` occurs in ``file_name``.
    """
    year = int(file_name[:4])
    for batch_name, batch in BATCH_NAMES.items():
        if batch_name in file_name:
            return year, batch
    # Fail loudly here instead of returning None, which would only surface
    # later as an opaque unpacking error in the caller.
    raise ValueError(f'解析失败,文件名“{file_name}”中未找到批次名称')
# Build the admission-score tables: one CSV and one XLSX per downloaded page.
print('生成投档线表:')
for file_name in os.listdir(DIR_ADMISSION_SCORE_PAGES):
    print(f'正在处理文件 “{file_name}” ...')
    page_path = os.path.join(DIR_ADMISSION_SCORE_PAGES, file_name)
    # Only the first table found in the page is used.
    table = pd.read_html(page_path)[0]

    # Keep only the columns of interest.
    table = filter_columns(table, ADMISSION_COLUMNS_NAMES)

    # Valid data rows are those whose first column (the institution code) is
    # made of digits only. Casting to str first keeps the check from raising
    # on non-string cells such as NaN or already-numeric values.
    data_rows = table.iloc[:, 0].astype(str).str.isdecimal()
    # Copy so the relabeling below acts on an independent frame, not a slice.
    table = table[data_rows].iloc[:, :4].copy()
    table.columns = list(ADMISSION_COLUMNS_NAMES.keys())
    table = table.reset_index(drop=True)

    # Write the cleaned table as both CSV and XLSX.
    year, batch = parse_admission_file_name(file_name)
    data_path = admission_data_path(year, batch)
    table.to_csv(data_path, index=False)
    data_path = admission_data_path(year, batch, '.xlsx')
    table.to_excel(data_path, index=False)
# Build the score-rank tables: one CSV and one XLSX per downloaded page.
print('生成一分一档表:')
for file_name in os.listdir(DIR_SCORE_RANK_TABLE_PAGES):
    print(f'正在处理文件 “{file_name}” ...')
    page_path = os.path.join(DIR_SCORE_RANK_TABLE_PAGES, file_name)
    # Only the first table found in the page is used.
    table = pd.read_html(page_path)[0]

    # Keep only the columns of interest.
    table = filter_columns(table, SCORE_RANK_COLUMNS_NAMES)

    # Valid data rows are those whose first column (the score) is made of
    # digits only. Casting to str first keeps the check from raising on
    # non-string cells such as NaN or already-numeric values.
    data_rows = table.iloc[:, 0].astype(str).str.isdecimal()
    # Copy so the relabeling below acts on an independent frame, not a slice.
    table = table[data_rows].iloc[:, :4].copy()
    table.columns = ['分值', '人数', '累计人数', '名次']
    table = table.reset_index(drop=True)

    # Write the cleaned table as both CSV and XLSX.
    # The year stays a string here (the leading four characters of the name).
    year = file_name[:4]
    # File names containing '理工类' are science; everything else is treated as arts.
    subject = 'science' if '理工类' in file_name else 'arts'
    data_path = score_rank_data_path(year, subject)
    table.to_csv(data_path, index=False)
    data_path = score_rank_data_path(year, subject, '.xlsx')
    table.to_excel(data_path, index=False)
1
https://gitee.com/ldeng1997/gaokao-application.git
git@gitee.com:ldeng1997/gaokao-application.git
ldeng1997
gaokao-application
Gaokao-Application
master

搜索帮助