3 Star 58 Fork 12

wyz / 电影推荐系统

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
基于内容的推荐.py 9.42 KB
一键复制 编辑 原始数据 按行查看 历史
wyz 提交于 2020-05-27 21:52 . final
import numpy as np
import pickle
import redis
import math
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from redis import StrictRedis
import random
# Input data files; the loaders below parse them as "::"-separated text.
file_name = "./movielens/ratings.dat"
# NOTE(review): same path as file_name; not referenced by the visible code.
ratings_file = "./movielens/ratings.dat"
movie_file = "./movielens/movies.dat"
## connect to redis
pool = redis.ConnectionPool(host='127.0.0.1', db=2)
# This rebinds the name `redis` from the imported module to the client
# instance; the functions in this file use `redis` as the client.
# NOTE(review): the cached values are pickled bytes, so responses must not be
# decoded to str for pickle.loads to work -- confirm whether
# decode_responses=True has any effect when a connection_pool is supplied.
redis = redis.StrictRedis(connection_pool=pool, decode_responses=True)
###################
###################
#### 正式开始 ######
###################
###################
## read movie info from "movies.dat"
## in order to find attrs in terms of movie_id
##
## return {movie:[attr1, attr2,]}
## in redis: "movie_info"
def get_movie_info(file_name):
    """Load the movie -> attribute-list mapping, using redis as a cache.

    If the redis key ``"movie_info"`` holds a value, it is unpickled and
    returned. Otherwise the mapping is parsed from ``file_name`` and stored
    (pickled) under that key before being returned.

    Args:
        file_name: Path to a UTF-8 text file with one ``::``-separated record
            per line. The first field is parsed as the integer movie id and
            the last field is split on ``|`` into attribute strings.

    Returns:
        dict mapping movie id (int) to its list of attribute strings.

    Raises:
        ValueError: If the first field of a non-blank line is not an integer.
    """
    # A single GET replaces the EXISTS + GET pair: one round trip, and no
    # window in which the key can disappear between the two calls.
    cached = redis.get("movie_info")
    if cached is not None:
        # NOTE(review): pickle.loads is only safe while the redis instance
        # is trusted; it must never be fed data an outsider can write.
        return pickle.loads(cached)

    res = {}
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a stray trailing newline) carry no record;
                # skipping them avoids int('') raising ValueError.
                continue
            record = line.split("::")
            res[int(record[0])] = record[-1].split("|")

    redis.set("movie_info", pickle.dumps(res))
    return res
## find movie's attrs in terms of movie_id
def get_movie_attrs(movie_info:dict, movie_id:int):
    """Return the attribute list stored for ``movie_id``.

    Args:
        movie_info: Mapping of movie id to its list of attributes.
        movie_id: Id of the movie to look up.

    Returns:
        The list stored under ``movie_id``; an empty list (after printing a
        notice) when the id is not a key of ``movie_info``.
    """
    # Handle the unknown-movie case first so the normal path stays flat.
    if movie_id not in movie_info:
        print("no info of this movie ")
        return []
    return movie_info[movie_id]
## read record from "ratings.dat"
## split into "train data" & "test data"
##
## records: [[user, movie, score],]
##
## return train_record, test_record
## "train_record" & "test_record" in redis
def get_split_ratings(file_name, ratio=0.98):
    """Randomly split the rating records into train and test lists.

    If both redis keys ``"train_record"`` and ``"test_record"`` hold values,
    the cached split is unpickled and returned so repeated runs see the same
    split. Otherwise the file is read, each record is assigned to the train
    list with probability ``ratio`` (else to the test list), and both lists
    are pickled into redis.

    Args:
        file_name: Path to a text file with one ``::``-separated record per
            line. The last field (the timestamp) is dropped.
        ratio: Probability that a record goes to the train list.

    Returns:
        ``(train_record, test_record)``; each is a list of records, and each
        record is the list of string fields without the trailing one.
    """
    # One GET per key replaces EXISTS + GET: fewer round trips and no window
    # in which a key can disappear between the check and the read.
    cached_train = redis.get("train_record")
    cached_test = redis.get("test_record")
    if cached_train is not None and cached_test is not None:
        # NOTE(review): pickle.loads is only safe while the redis instance
        # is trusted.
        return pickle.loads(cached_train), pickle.loads(cached_test)

    train_record = []
    test_record = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise become an empty record that
                # breaks every consumer indexing record[0].
                continue
            record = line.split("::")[:-1]  # the trailing timestamp is not needed
            if random.random() < ratio:
                train_record.append(record)
            else:
                test_record.append(record)

    # Cache the split in redis.
    redis.set("train_record", pickle.dumps(train_record))
    redis.set("test_record", pickle.dumps(test_record))
    return train_record, test_record
## generate user's preference VECTOR for movies
##
## idea (not implemented below): count only the movies a user rated above their own average
##
## records: [[user, movie, score],]
##
## return {user:{attr1:count, attr2:count}}
##
## in redis: "user_preference"
def get_user_preference(movie_info, records):
    """Build each user's attribute-frequency preference vector.

    If the redis key ``"user_preference"`` holds a value, it is unpickled and
    returned. Otherwise, for every record, each attribute of the rated movie
    adds 1 to that user's count for the attribute. The rating value itself is
    not used, so every rated movie counts equally. The result is pickled into
    redis under the same key.

    Args:
        movie_info: Mapping of movie id (int) to its list of attributes.
        records: Iterable of records whose first two fields are the user id
            and the movie id (both convertible with ``int``).

    Returns:
        dict of the form ``{user_id: {attribute: count}}``.

    Raises:
        KeyError: If a record refers to a movie id missing from ``movie_info``.
    """
    # A single GET replaces EXISTS + GET: one round trip, and no window in
    # which the key can disappear between the two calls.
    cached = redis.get("user_preference")
    if cached is not None:
        # NOTE(review): pickle.loads is only safe while the redis instance
        # is trusted.
        return pickle.loads(cached)

    user_prefer = {}
    for record in records:
        user_id = int(record[0])
        movie_id = int(record[1])
        counts = user_prefer.setdefault(user_id, {})
        for attr in movie_info[movie_id]:
            counts[attr] = counts.get(attr, 0) + 1

    redis.set("user_preference", pickle.dumps(user_prefer))
    return user_prefer
## build User-Movie dict
##
## type: "train" or "test"; selects the redis cache key ("train_dict" or "test_dict")
##
## return {user:{movie:score}}, where each score is centered on that user's mean rating
## "train_dict" & "test_dict" in redis
def get_dict(records:list, type:str):
    """Build the user -> {movie: mean-centered rating} dictionary.

    If the redis key ``type + '_dict'`` holds a value, it is unpickled and
    returned. Otherwise the dictionary is built from ``records``, every
    user's ratings are shifted by that user's mean rating, and the result is
    pickled into redis under the same key.

    Args:
        records: List of records whose first three fields are the user id,
            movie id and rating (all convertible with ``int``).
        type: Prefix of the cache key, e.g. ``"train"`` or ``"test"``. The
            parameter name shadows the builtin but is kept because it is
            part of the function's interface.

    Returns:
        dict of the form ``{user_id: {movie_id: rating - user_mean}}``.
    """
    cache_key = type + '_dict'
    # A single GET replaces EXISTS + GET: one round trip, and no window in
    # which the key can disappear between the two calls.
    cached = redis.get(cache_key)
    if cached is not None:
        # NOTE(review): pickle.loads is only safe while the redis instance
        # is trusted.
        return pickle.loads(cached)

    user_movie = {}
    for record in records:
        user_id = int(record[0])
        movie_id = int(record[1])
        rating = int(record[2])
        user_movie.setdefault(user_id, {})[movie_id] = rating

    # Mean-center each user's ratings. The builtin sum() is used instead of a
    # local accumulator named `sum`, which used to shadow it.
    for movies in user_movie.values():
        mean = sum(movies.values()) / len(movies)
        for movie in movies:
            movies[movie] -= mean

    # Cache the result in redis.
    redis.set(cache_key, pickle.dumps(user_movie))
    return user_movie
## compute the cosine similarity between one movie and a user's preference vector
def predict_preference(user_prefer, user_id, movie_info, movie_id):
    """Return the cosine similarity between a user's preferences and a movie.

    The user vector holds the per-attribute counts from
    ``user_prefer[user_id]``. The movie vector has a 1 for every entry of
    ``movie_info[movie_id]``, so its norm is ``sqrt(len(attrs))``.

    Args:
        user_prefer: Mapping ``{user_id: {attribute: count}}``.
        user_id: Key into ``user_prefer``.
        movie_info: Mapping ``{movie_id: [attribute, ...]}``.
        movie_id: Key into ``movie_info``.

    Returns:
        The similarity as a float; ``0.0`` when either vector has zero norm
        (no preferences recorded, or a movie without attributes).

    Raises:
        KeyError: If ``user_id`` or ``movie_id`` is unknown.
    """
    user_preference = user_prefer[user_id]
    movie_attrs = movie_info[movie_id]

    movie_norm = np.sqrt(len(movie_attrs))
    user_norm = np.sqrt(sum(freq * freq for freq in user_preference.values()))
    denominator = movie_norm * user_norm
    if denominator == 0:
        # Without this guard 0/0 would produce NaN plus a RuntimeWarning.
        return 0.0

    # Dot product: only attributes present on both sides contribute.
    dot = sum(user_preference.get(attr, 0) for attr in movie_attrs)
    return dot / denominator
## build the Movie-User inverted index
##
## return {movie:{user1,user2,}} using "set"
## "reverse_dict" in redis
def get_movie_user_dict(records:list)->dict:
    """Build the inverted index from movie id to the set of users who rated it.

    If the redis key ``"reverse_dict"`` holds a value, it is unpickled and
    returned. Otherwise the index is built from ``records`` and pickled into
    redis under that key.

    Args:
        records: List of records whose first two fields are the user id and
            the movie id (both convertible with ``int``).

    Returns:
        dict of the form ``{movie_id: {user_id, ...}}``.
    """
    # A single GET replaces EXISTS + GET: one round trip, and no window in
    # which the key can disappear between the two calls.
    cached = redis.get('reverse_dict')
    if cached is not None:
        # NOTE(review): pickle.loads is only safe while the redis instance
        # is trusted.
        return pickle.loads(cached)

    movie_user = {}
    for record in records:
        user_id = int(record[0])
        movie_id = int(record[1])
        movie_user.setdefault(movie_id, set()).add(user_id)

    # Cache the result in redis.
    redis.set('reverse_dict', pickle.dumps(movie_user))
    return movie_user
## re-rank an existing recommendation list to obtain a better one
def filter(movie_info:dict, recommended:list, train_x:list, train_y:list, n:int):
    """Re-rank a recommendation list with a logistic-regression classifier.

    A classifier is fitted on ``train_x`` / ``train_y`` and each recommended
    movie is scored with the probability in column 1 of ``predict_proba``
    (the second class in ``clf.classes_``).

    The function name shadows the builtin ``filter``; it is kept because it
    is this function's public name.

    Args:
        movie_info: Mapping of movie id to its list of attributes.
        recommended: Movie ids to re-rank.
        train_x: Training feature rows for the classifier.
        train_y: Training labels for the classifier.
        n: Maximum number of results; a negative value means no limit (the
            same convention ``recommend`` uses).

    Returns:
        List of ``(movie_id, probability)`` tuples, highest probability first.
    """
    if not recommended:
        # Nothing to score; predict_proba rejects an empty sample list.
        return []

    clf = LogisticRegression(random_state=0).fit(train_x, train_y)
    # NOTE(review): get_movie_attrs returns attribute *strings*; the
    # classifier needs numeric rows encoded the same way as train_x. The
    # encoding of train_x is not visible here -- confirm and encode to match.
    test_x = [get_movie_attrs(movie_info, movie_id) for movie_id in recommended]

    # Keep the whole probability matrix. Indexing it with [0][1] produced a
    # scalar that was then subscripted per movie, which always raised.
    probabilities = clf.predict_proba(test_x)
    container = [
        (movie_id, row[1]) for movie_id, row in zip(recommended, probabilities)
    ]
    # Sort in place: the result of sorted() used to be discarded.
    container.sort(key=lambda x: x[1], reverse=True)
    if n >= 0:
        container = container[:n]
    return container
## recommend movies for the given user id
## n limits the length of the result; if n is negative, the length is not limited
def recommend(movie_info:dict, user_preference:dict, train_dict:dict, user_id:int, n:int)->dict:
    """Recommend movies whose similarity to the user's preferences is >= 0.5.

    Movies the user already has in ``train_dict`` are skipped.

    Args:
        movie_info: Mapping ``{movie_id: [attribute, ...]}``.
        user_preference: Mapping ``{user_id: {attribute: count}}``.
        train_dict: Mapping ``{user_id: {movie_id: score}}`` of known ratings.
        user_id: The user to recommend for.
        n: Maximum number of recommendations; a negative value means no limit.

    Returns:
        dict ``{movie_id: similarity}``. When ``n >= 0`` it holds the ``n``
        highest-scoring movies in descending score order; when ``n < 0`` it
        holds every qualifying movie in ``movie_info`` order. An empty dict
        is returned for a user with no preference vector.
    """
    if user_id not in user_preference:
        # Cold-start user: no preference vector to compare against, so there
        # is nothing to recommend (this used to raise KeyError).
        return {}

    seen = train_dict.get(user_id, {})
    rank = {}
    for movie_id in movie_info:
        if movie_id in seen:
            continue
        ans = predict_preference(user_preference, user_id, movie_info, movie_id)
        if ans >= 0.5:
            rank[movie_id] = ans

    # No limit on the length of the result.
    if n < 0:
        return rank

    # Keep only the n best-scoring movies.
    return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n])
# recall, precision, coverage, popularity
def evaluate(movie_info:dict, user_preference:dict, train_dict:dict, test_dict:dict,reverse_dict:dict, n:int)->tuple:
    """Evaluate the recommender on the test users.

    For every user in ``test_dict`` a recommendation list is produced with
    ``recommend`` and appended to ``task1_0.csv`` as one line of the form
    ``<user>:<movie>,<movie>,...,``.

    Args:
        movie_info: Mapping ``{movie_id: [attribute, ...]}``.
        user_preference: Mapping ``{user_id: {attribute: count}}``.
        train_dict: Mapping ``{user_id: {movie_id: score}}`` (train data).
        test_dict: Mapping ``{user_id: {movie_id: score}}`` (test data).
        reverse_dict: Mapping ``{movie_id: set_of_user_ids}``.
        n: Recommendation list length passed through to ``recommend``.

    Returns:
        ``(recall, precision, coverage, popularity)`` where
        recall = hits / number of test items,
        precision = hits / number of recommended items,
        coverage = distinct recommended movies / distinct movies the test
        users have in ``train_dict``,
        popularity = mean of ``log(1 + len(reverse_dict[movie]))`` over all
        recommended items.
        A metric whose denominator is zero is reported as ``0.0``.
    """
    def _ratio(numerator, denominator):
        # Empty test data or empty recommendation lists must not crash the
        # evaluation with ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    hit = 0
    all_reality = 0
    all_prediction = 0
    pop = 0
    recommended = set()
    all_movies = set()

    # Open the output once instead of re-opening it for every user.
    with open("task1_0.csv", "a+") as f:
        for u, reality in test_dict.items():
            all_reality += len(reality)  # denominator of recall
            prediction = recommend(movie_info, user_preference, train_dict, u, n)

            # Write this user's predictions to the output file.
            f.write(str(u) + ":" + "".join(str(movie_id) + "," for movie_id in prediction) + "\n")

            all_prediction += len(prediction)  # denominator of precision
            for i in prediction:
                recommended.add(i)  # numerator of coverage
                pop += math.log(1 + len(reverse_dict.get(i, [])))  # popularity
                if i in reality:
                    hit += 1

            # Denominator of coverage: movies the test users have in train.
            all_movies.update(train_dict.get(u, {}))

    return (
        _ratio(hit, all_reality),
        _ratio(hit, all_prediction),
        _ratio(len(recommended), len(all_movies)),
        _ratio(pop, all_prediction),
    )
if __name__ == "__main__":
    # Script entry point: load (or build) every cached structure, evaluate
    # the recommender and append the metrics to task1_1.csv.
    user = 2  # NOTE(review): not referenced by the visible code
    k = 20  # number of nearest neighbours; NOTE(review): not referenced by the visible code
    n = 10  # recommend n movies; the test data holds 4-5 movies on average (presumably per user)
    movie_info = get_movie_info(movie_file)
    train_record, test_record = get_split_ratings(file_name)
    user_prefer = get_user_preference(movie_info, train_record)
    train_dict = get_dict(train_record, "train")
    test_dict = get_dict(test_record, "test")
    reverse_dict = get_movie_user_dict(train_record)
    ########################################################################################
    a, b, c, d = evaluate(movie_info, user_prefer, train_dict, test_dict, reverse_dict, n)
    print('n:%d' % n)
    print("recall: %f, precision: %f, coverage: %f, popularity: %f" % (a, b, c, d))
    # Novelty is reported as the reciprocal of popularity; a popularity of 0
    # would otherwise raise ZeroDivisionError.
    novelty = 1 / d if d else float("inf")
    with open("task1_1.csv", "a+") as f:
        # Header columns: recall, precision, coverage, popularity, novelty.
        f.write("召回率,精确率,覆盖率,流行度,新颖性\n")
        # The trailing newline keeps the header appended by the next run from
        # being glued onto this data row (the file is opened in append mode).
        f.write("%f,%f,%f,%f,%f\n" % (a, b, c, d, novelty))
    ########################################################################################
    # To force a cached structure to be rebuilt, delete its redis key, e.g.:
    # print(redis.keys())
    # print(redis.delete('train_dict'))
    # print(redis.delete('user_simi_dict'))
    # print(redis.delete('test_dict'))
Python
1
https://gitee.com/earth_wyz/movieRS.git
git@gitee.com:earth_wyz/movieRS.git
earth_wyz
movieRS
电影推荐系统
master

搜索帮助