1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | # -*- coding:utf-8 -*- critics = {'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,'The Night Listener': 3.0}, 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 'You, Me and Dupree': 3.5}, 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0, 'Superman Returns': 3.5, 'The Night Listener': 4.0}, 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'The Night Listener': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 2.5}, 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0, 'You, Me and Dupree': 2.0}, 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5}, 'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0, 'Superman Returns': 4.0}, 'Yu':{'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,'The Night Listener': 3.0}} from math import sqrt print u"距离与相关系数:它们之间是相反的,若距离越短(距离的数值越小),则相似度越大(相似度的数值越大)" # 欧几里得距离 def sim_distance(prefs, person1, person2): # 得到两者同时评价过的电影的列表 si = { } for item in prefs[person1]: if item in prefs[person2]: si[item] = 1 # 若不存在同时评价过的电影则返回0 if len(si) == 0 : return 0 # 计算所有差值的平方和 sum_of_squares = sum([pow(prefs[person1][item] - prefs[person2][item], 2) for item in prefs[person1] if item in prefs[person2]]) # sum()函数中的参数是一个list,sum([item for item in a if item in b]) return 1 / (1+sqrt(sum_of_squares)) print u"欧几里得距离(最后给出的数值,实际上是给出了相似度评价):" print(sim_distance(critics, 'Lisa Rose', 'Gene Seymour')) # 皮尔逊相关系数 def sim_pearson(prefs, p1, p2): si = { } for item in prefs[p1]: if item in prefs[p2]: si[item] = 1 n = len(si) if n == 0: return 1 #如果两者不存在同时评论过的电影时,返回1 #对所有偏好求和 sum1 = sum([prefs[p1][it] for it in si]) sum2 = sum([prefs[p2][it] for it in si]) #求平方和 sum1Sq = sum([pow(prefs[p1][it], 2) for it in si]) sum2Sq = sum([pow(prefs[p2][it], 2) for it in si]) #求乘积之和 pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si]) #计算皮尔逊评价值 num = pSum - (sum1 * sum2 / n) den = sqrt((sum1Sq-pow(sum1, 2)/n) * (sum2Sq-pow(sum2, 2)/n)) if den == 0: return 0 r = num/den return r print u"皮尔逊相关系数:" print (sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')) # Jaccard相似度(狭义)——只能用于判断两者之间是否一致,而不能根据其评分来判定相似度 def sim_jaccard(prefs, per1, per2): si_union = { } #并集 si_inter = { } #交集 si_union = dict(prefs[per1], **prefs[per2]) for item in prefs[per1]: if item in prefs[per2]: si_inter[item] = min(prefs[per1][item], prefs[per2][item]) sum1 = len(si_inter) sum2 = len(si_union) if (sum2 == 0): return 0 r = float(sum1) / sum2 return r print u"Jaccard相似度(狭义)——只能用于判断两者之间是否一致,而不能根据其评分来判定相似度:" print sim_jaccard(critics, 'Lisa Rose', 'Gene Seymour') #曼哈顿距离(城市街区距离 ) def sim_manhattan(prefs, p1, p2): si = { } for item in p1: if item in p2: si[item] = 1 if len(item) == 0: return 1 sum_of_minus = sum([abs(prefs[p1][item] - prefs[p2][item]) for item in prefs[p1] if item in prefs[p2]]) return 1 / (sum_of_minus+1) print u"曼哈顿距离(最后得到的数值也是相似度):" print sim_manhattan(critics, 'Lisa Rose', 'Gene Seymour') |