3 Star 0 Fork 1

coda / 朴素贝叶斯预测kaggle泰坦尼克号

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
kaggle_NB.py 6.31 KB
一键复制 编辑 原始数据 按行查看 历史
import numpy as np
import ReadCSV
import math
def calculate_before_prop(dataMat):
    """Compute the prior probability of each class label.

    Column 0 of every row is read as the class label; rows labelled ``1``
    and rows labelled ``0`` are counted separately.

    Args:
        dataMat: Sequence of rows whose first element is the class label.

    Returns:
        A ``(true_prior, false_prior)`` tuple of floats: the fraction of
        rows labelled 1 and the fraction labelled 0.  ``(0.0, 0.0)`` is
        returned for an empty data set instead of dividing by zero.
    """
    total = len(dataMat)
    if total == 0:
        # No rows means no evidence for either class.
        return 0.0, 0.0
    labels = [row[0] for row in dataMat]
    return labels.count(1) / total, labels.count(0) / total
def separate_classes(dataMat):
    """Partition the rows of a binary-labelled data set by class.

    Args:
        dataMat: Sequence of rows whose first element is the class label.

    Returns:
        A ``(class1, class2)`` tuple of lists.  ``class1`` holds every row
        whose label equals 1; ``class2`` holds all remaining rows.  The
        original row order is preserved within each list.
    """
    positives = [row for row in dataMat if row[0] == 1]
    negatives = [row for row in dataMat if not (row[0] == 1)]
    return positives, negatives
def calculate_col_prop(dataMat, n):
    """Count how often each value occurs in column ``n``.

    Intended for the discrete feature columns.  Cells holding the empty
    string are treated as missing and ignored.

    Args:
        dataMat: Sequence of rows (already split by class by the caller).
        n: Index of the column to tally; negative indices are allowed.

    Returns:
        A dict mapping each distinct non-empty value to its occurrence count.
    """
    counts = {}
    for row in dataMat:
        value = row[n]
        if value == '':
            # Missing cell: skip it rather than counting '' as a category.
            continue
        counts[value] = counts.get(value, 0) + 1
    return counts
def calculate_disperse_total(dataMat):
    """Tally value counts for every discrete feature column.

    The discrete columns are fixed at indices 1, 2 and -1 (the last column).

    Args:
        dataMat: Sequence of rows belonging to one class.

    Returns:
        A list of three count dicts, one per discrete column, in the order
        column 1, column 2, last column.
    """
    discrete_columns = (1, 2, -1)
    return [calculate_col_prop(dataMat, column) for column in discrete_columns]
def calculate_disperse_prop(dictList):
    """Convert per-category counts into relative frequencies, in place.

    Every dict in ``dictList`` is normalized by the sum of its own values,
    so each one ends up holding maximum-likelihood probabilities (no
    smoothing is applied).  All keys that are present get normalized, so a
    category missing from one class subset no longer raises ``KeyError``
    and unexpected categories are not left behind as raw counts.

    Args:
        dictList: List of count dicts, one per discrete feature column.

    Returns:
        The same ``dictList`` object, whose dicts now map each category to
        its probability.  A dict that is empty or whose counts sum to zero
        is left untouched.
    """
    for counts in dictList:
        total = sum(counts.values())
        if total == 0:
            # Nothing observed for this column; avoid dividing by zero.
            continue
        for category in counts:
            counts[category] = counts[category] / total
    return dictList
def calculate_average_variance(dataMat, n):
    """Compute the mean and population variance of column ``n``.

    Cells holding the empty string are treated as missing and skipped,
    matching how the discrete-column counting and the prediction loop
    handle missing data.

    Args:
        dataMat: Sequence of rows belonging to one class.
        n: Index of the (numeric) column to summarize.

    Returns:
        A two-element list ``[mean, variance]`` of floats, where the
        variance divides by the number of values (not ``n - 1``).
        ``[0.0, 0.0]`` is returned when the column has no usable values,
        instead of dividing by zero.
    """
    values = [row[n] for row in dataMat if row[n] != '']
    if not values:
        return [0.0, 0.0]
    count = len(values)
    mean = float(sum(values) / count)
    squared_deviation = sum(math.pow(value - mean, 2) for value in values)
    return [mean, float(squared_deviation / count)]
def calculate_continuous_pro(dataMat):
    """Summarize every continuous feature column of one class.

    Column layout used by this script: 0 is the label, 1 and 2 are
    discrete, 3 through 6 are continuous, and 7 is discrete.

    Args:
        dataMat: Sequence of rows belonging to one class.

    Returns:
        A list of four ``[mean, variance]`` pairs, for columns 3, 4, 5
        and 6 in that order.
    """
    continuous_columns = range(3, 7)
    return [calculate_average_variance(dataMat, column)
            for column in continuous_columns]
def calculate_row_rate(n, avList, x):
    """Evaluate the Gaussian likelihood of ``x`` for continuous feature ``n``.

    Args:
        n: Zero-based position of the feature inside ``avList``.
        avList: List of ``[mean, variance]`` pairs, one per continuous
            feature.
        x: Observed feature value to score.

    Returns:
        The normal probability density at ``x`` for the feature's mean and
        variance.  When the variance is zero the density is undefined, so
        the neutral factor ``1`` is returned and the feature does not
        influence the class score.
    """
    mean = avList[n][0]
    variance = avList[n][-1]
    # Only a zero variance is degenerate; a zero mean is a perfectly valid
    # Gaussian and must still be scored.
    if variance == 0:
        return 1
    coefficient = 1 / math.sqrt(2 * math.pi * variance)
    exponent = -math.pow(x - mean, 2) / (2 * variance)
    return float(coefficient * math.exp(exponent))
def testNB(testData, truePro, falsePro, trueDisperseProList,
           falseDisperseProList, trueContinuousProList,
           falseContinuousProList):
    """Predict a class label for every complete row of ``testData``.

    For each row the unnormalized naive Bayes score of both classes is
    computed (prior * discrete-feature probabilities * Gaussian
    likelihoods of the continuous features) and the larger one wins.

    Args:
        testData: Sequence of rows with at least 8 columns laid out like
            the training rows (column 0 is not used for scoring).
        truePro: Prior probability of class 1.
        falsePro: Prior probability of class 0.
        trueDisperseProList: Per-column category probabilities for class 1
            (columns 1, 2 and the last column, in that order).
        falseDisperseProList: Same as above for class 0.
        trueContinuousProList: ``[mean, variance]`` pairs of columns 3-6
            for class 1.
        falseContinuousProList: Same as above for class 0.

    Returns:
        A list of predictions (1 or 0).  Rows containing an empty string
        in any of their first 8 columns are skipped, so the list may be
        shorter than ``testData``.  Ties are predicted as 0.
    """
    predictions = []
    for row in testData:
        # Rows with any missing cell cannot be scored and are skipped.
        if any(row[col] == '' for col in range(8)):
            continue
        true_score = _nb_class_score(
            truePro, trueDisperseProList, trueContinuousProList, row)
        false_score = _nb_class_score(
            falsePro, falseDisperseProList, falseContinuousProList, row)
        predictions.append(1 if true_score > false_score else 0)
    return predictions


def _nb_class_score(prior, disperseProList, continuousProList, row):
    """Return the unnormalized naive Bayes score of ``row`` for one class."""
    # A category never seen for this class has probability 0 (no smoothing)
    # rather than raising KeyError.
    score = (prior
             * disperseProList[0].get(row[1], 0.0)
             * disperseProList[1].get(row[2], 0.0)
             * disperseProList[2].get(row[-1], 0.0))
    # Continuous features 0..3 live in row columns 3..6.
    for feature in range(4):
        score = score * calculate_row_rate(
            feature, continuousProList, row[3 + feature])
    return score
# ---- Driver script: train on train.csv, then predict the rows of test.csv ----
trainData = ReadCSV.getData('train.csv') # load the training data
#print(trainData)
truePro ,falsePro = calculate_before_prop(trainData) # prior probabilities of label 1 and label 0
print(truePro)
print(falsePro)
# priors verified to look correct
trueData,falseData = separate_classes(trainData) # split the rows by class
trueDisperseList = calculate_disperse_total(trueData)
falseDisperseList = calculate_disperse_total(falseData)
# count dicts for the discrete columns are now available
#print(trueDisperseList)
#print(falseDisperseList)
# NOTE: calculate_disperse_prop returns the list it was given, so each
# *ProList below is the same object as the corresponding *List above.
trueDisperseProList = calculate_disperse_prop(trueDisperseList)
falseDisperseProList = calculate_disperse_prop(falseDisperseList)
#print(trueDisperseProList[0][a[0]])
# next, summarize the continuous columns (mean and variance per class)
trueContinuousProList = calculate_continuous_pro(trueData)
falseContinuousProList = calculate_continuous_pro(falseData)
#print(trueContinuousProList)
#print(falseContinuousProList)
# finally, run the classifier on the test set
testData = ReadCSV.getTestData('test.csv')
PredictList = testNB(testData,truePro,falsePro,trueDisperseProList,falseDisperseProList,trueContinuousProList,falseContinuousProList)
print('预测结果为 :' ,PredictList)
Python
1
https://gitee.com/coda469993172/PuSuBeiYeSiYuCekaggleTaiTanNiKeHao.git
git@gitee.com:coda469993172/PuSuBeiYeSiYuCekaggleTaiTanNiKeHao.git
coda469993172
PuSuBeiYeSiYuCekaggleTaiTanNiKeHao
朴素贝叶斯预测kaggle泰坦尼克号
master

搜索帮助