-
Notifications
You must be signed in to change notification settings - Fork 0
/
graph_construction.py
361 lines (292 loc) · 12.8 KB
/
graph_construction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
"""CSC111 Winter 2021 Project Graph File
Instructions
===============================
This Python module contains a Graph class from University of Toronto's CSC111
course, and builds a network of movies and reviewers. The 'to_networkx' function
is taken from CSC111's assignment 3, where the link is referenced below.
This file also contains functions to produce recommendations based on our network.
CSC111 Course Notes: https://www.teach.cs.toronto.edu/~csc110y/fall/notes/
The link to assignment 3 can be found here,
https://www.teach.cs.toronto.edu/~csc111h/winter/assignments/a3/handout/
Copyright and Usage Information
===============================
This file is Copyright (c) 2021
Rafee Rahman, Michael Galorro, Kimiya Raminrad, Mojan Majid
"""
from __future__ import annotations
from typing import Any, Union
import json
import random
import pandas as pd
import networkx as nx
class _Vertex:
"""A vertex in a movie review graph, used to represent a reviewer or a movie.
Instance Attributes:
- item: The data stored in this vertex, representing a user or book.
- kind: The type of this vertex: 'user' or 'book'.
- neighbours: The vertices that are adjacent to this vertex, and their corresponding
edge weights.
Representation Invariants:
- self not in self.neighbours
- all(self in u.neighbours for u in self.neighbours)
- self.kind in {'reviewer', 'movie'}
"""
item: Any
kind: str
neighbours: dict[_Vertex, Union[int, float]]
def __init__(self, item: Any, kind: str) -> None:
"""Initialize a new vertex with the given item and kind.
This vertex is initialized with no neighbours.
Preconditions:
- kind in {'reviewer', 'movie'}
"""
self.item = item
self.kind = kind
self.neighbours = {}
def degree(self) -> int:
"""Return the degree of this vertex."""
return len(self.neighbours)
def reviewer_similarity_score(self, other: _Vertex) -> float:
"""Return the similarity score between this vertex and other.
Similarity score is based on how many movies both reviewers rated, and if those scores are
similar.
Preconditions:
- self.kind == 'reviewer'
- other.kind == 'reviewer'
"""
if self.degree() == 0 or other.degree == 0:
return 0.0
else:
neighbours = self.neighbours
other_neighbours = other.neighbours
same_neighbours = neighbours.keys() & other_neighbours.keys()
union = len(self.neighbours) + len(other.neighbours)
sim_score_so_far = 0
for vertex in same_neighbours:
# 'bothered reviewing' bonus:
sim_score_so_far += 1
# 'love' bonus
if self.neighbours[vertex] >= 9 and other.neighbours[vertex] >= 9:
sim_score_so_far += 2
# 'like' bonus
elif self.neighbours[vertex] >= 7 and other.neighbours[vertex] >= 7:
sim_score_so_far += 1
return sim_score_so_far / union
class Graph:
"""A weighted graph used to represent a network of movie reviews which keeps track of ratings.
"""
# Private Instance Attributes:
# - _vertices:
# A collection of the vertices contained in this graph.
# Maps item to _Vertex object.
_vertices: dict[Any, _Vertex]
def __init__(self) -> None:
"""Initialize an empty graph, with no vertices or edges."""
self._vertices = {}
def add_reviewer(self, movies: list[str]) -> None:
"""Add a reviewer vertex to this graph with every movie in movies reviewed with a
10 rating. Each edge to a movie is one-way.
Preconditions:
- all([movie in graph.get_all_vertices() for movie in movies])"""
reviewer = _Vertex('CSC111_Reviewer', 'reviewer')
for movie in movies:
m_vertex = self._vertices[movie]
reviewer.neighbours[m_vertex] = 10
self._vertices['CSC111_Reviewer'] = reviewer
def suggest_movies(self, reviewer: Any, other: Any) -> list[Any]:
"""Suggests movies for reviewer based on movies that the other reviewer has rated highly.
Returns an empty list if there are no good suggestions available.
Preconditions:
- reviewer in graph.get_all_vertices()
- other in graph.get_all_vertices()
"""
potential_recs = self.get_neighbours(other)
suggestions_so_far = []
neighbours = self.get_neighbours(reviewer)
for p_rec in potential_recs:
if p_rec not in neighbours and self.get_weight(other, p_rec) >= 9:
suggestions_so_far.append(p_rec)
return suggestions_so_far
def add_vertex(self, item: Any, kind: str) -> None:
"""Add a vertex with the given item and kind to this graph.
The new vertex is not adjacent to any other vertices.
Do nothing if the given item is already in this graph.
Preconditions:
- kind in {'reviewer', 'movie'}
"""
if item not in self._vertices:
self._vertices[item] = _Vertex(item, kind)
def add_edge(self, item1: Any, item2: Any, weight: Union[int, float]) -> None:
"""Add an edge between the two vertices with the given items in this graph,
with the given weight.
Raise a ValueError if item1 or item2 do not appear as vertices in this graph.
Preconditions:
- item1 != item2
"""
if item1 in self._vertices and item2 in self._vertices:
v1 = self._vertices[item1]
v2 = self._vertices[item2]
# Add the new edge
v1.neighbours[v2] = weight
v2.neighbours[v1] = weight
else:
# We didn't find an existing vertex for both items.
raise ValueError
def get_neighbours(self, item: Any) -> set:
"""Return a set of the neighbours of the given item.
Raise a ValueError if item does not appear as a vertex in this graph.
"""
if item in self._vertices:
v = self._vertices[item]
return {neighbour.item for neighbour in v.neighbours}
else:
raise ValueError
def get_weight(self, item1: Any, item2: Any) -> Union[int, float]:
"""Return the weight of the edge between the given items.
Return 0 if item1 and item2 are not adjacent.
Preconditions:
- item1 and item2 are vertices in this graph
"""
v1 = self._vertices[item1]
v2 = self._vertices[item2]
return v1.neighbours.get(v2, 0)
def get_all_vertices(self, kind: str = '') -> set:
"""Return a set of all vertex items in this graph.
If kind != '', only return the items of the given vertex kind.
Preconditions:
- kind in {'', 'reviewer', 'movie'}
"""
if kind != '':
return {v.item for v in self._vertices.values() if v.kind == kind}
else:
return set(self._vertices.keys())
def get_similarity_score(self, reviewer1: Any, reviewer2: Any) -> float:
""" Return the similarity score between the two given reviewers in this graph.
Raise a ValueError if reviewer1 or reviewer2 do not appear as vertices in this graph.
Preconditions:
-reviewer1 in self.get_all_vertices()
-reviewer 2 in self.get_all_vertices()
"""
v1 = self._vertices[reviewer1]
v2 = self._vertices[reviewer2]
return v1.reviewer_similarity_score(v2)
def to_networkx(self, max_vertices: int = 5000) -> nx.Graph:
""" This code is from University of Toronto's CSC111, Assignment 3.
The link to assignment 3 can be found here,
https://www.teach.cs.toronto.edu/~csc111h/winter/assignments/a3/handout/
Convert this graph into a networkx Graph.
max_vertices specifies the maximum number of vertices that can appear in the graph.
(This is necessary to limit the visualization output for large graphs.)
Note that this method is provided for you, and you shouldn't change it.
"""
graph_nx = nx.Graph()
for v in self._vertices.values():
graph_nx.add_node(v.item, kind=v.kind)
for u in v.neighbours:
if graph_nx.number_of_nodes() < max_vertices:
graph_nx.add_node(u.item, kind=u.kind)
if u.item in graph_nx.nodes:
graph_nx.add_edge(v.item, u.item)
if graph_nx.number_of_nodes() >= max_vertices:
break
return graph_nx
def load_review_graph_df(df: pd.DataFrame, threshold: int = 0) -> Graph:
"""Return a movie review graph from the given data set. Only includes reviewers with more
reviews than the given threshold. Default threshold is 0.
Preconditions:
- df is a dataframe returned by clean_dataframe
"""
graph = Graph()
reviewers = {}
for i in range(0, len(df['reviewer'])):
reviewer = df['reviewer'][i]
movie = df['movie'][i]
rating = df['rating'][i]
if reviewer not in reviewers:
reviewers[reviewer] = [(movie, rating)]
else:
reviewers[reviewer].append((movie, rating))
for reviewer in reviewers:
if len(reviewers[reviewer]) > threshold:
graph.add_vertex(reviewer, 'reviewer')
for movie in reviewers[reviewer]:
rating = int(movie[1])
graph.add_vertex(movie[0], 'movie')
graph.add_edge(reviewer, movie[0], rating)
return graph
def load_review_graph_json(reviews_file: str, threshold: int = 0) -> Graph:
"""Return a movie review graph from the given data set. Only includes reviewers with more
reviews than the given threshold. Default threshold is 0.
Preconditions:
- reviews_file is a JSON file returned by create_json
"""
graph = Graph()
reviewers = {}
with open(reviews_file) as json_file:
data = json.load(json_file)
for review in data['data']:
reviewer = review[0]
movie = review[1]
rating = int(review[2])
if reviewer not in reviewers:
reviewers[reviewer] = [(movie, rating)]
else:
reviewers[reviewer].append((movie, rating))
for reviewer in reviewers:
if len(reviewers[reviewer]) > threshold:
graph.add_vertex(reviewer, 'reviewer')
for movie in reviewers[reviewer]:
rating = int(movie[1])
graph.add_vertex(movie[0], 'movie')
graph.add_edge(reviewer, movie[0], rating)
return graph
def get_suggestions(reviewer: Any, graph: Graph, threshold: int = 10) -> list[Any]:
"""Return a list of movie suggestions based on the similarity score of the given reviewer
in relation to the rest of the reviewers in the given graph. The list is at
most as long as the given threshold.
Preconditions:
- threshold >= 1
- reviewer in graph.get_all_vertices()
"""
reviewers_so_far = helper(reviewer, graph)
sim_scores = {}
for user in reviewers_so_far:
sim_score = round(graph.get_similarity_score(user, reviewer), 2)
if sim_score > 0:
if sim_score not in sim_scores:
sim_scores[sim_score] = [user]
else:
sim_scores[sim_score].append(user)
recommendations_so_far = set()
while len(recommendations_so_far) < threshold and len(sim_scores) > 0:
similar_reviewers = sim_scores[max(sim_scores)]
if similar_reviewers != []:
sim_user = similar_reviewers.pop(random.randint(0, len(similar_reviewers) - 1))
rec_movies = graph.suggest_movies(reviewer, sim_user)
for movie in rec_movies:
recommendations_so_far.add(movie)
else:
sim_scores.pop(max(sim_scores))
recommendations = list(recommendations_so_far)
while len(recommendations) > threshold:
recommendations.pop()
if len(recommendations) == 0:
return [' recommendations not found. Try adding more movies!']
else:
return recommendations
def helper(reviewer: Any, graph: Graph) -> set:
"""this is a helper function for get_suggestions()"""
reviewers_so_far = set()
for movie in graph.get_neighbours(reviewer):
for user in graph.get_neighbours(movie):
if graph.get_weight(user, movie) >= 8:
reviewers_so_far.add(user)
return reviewers_so_far
if __name__ == '__main__':
import python_ta
python_ta.check_all(config={
'extra-imports': ['networkx', 'pandas', 'json', 'random'],
'allowed-io': ['load_review_graph_json'],
'max-line-length': 100,
'disable': ['E1136']
})