Question: It takes forever to run questions 5.3 and 5.4 due to the large data; please redo the code so that it runs faster.

# It takes forever to run questions 5.3 and 5.4 due to the large data; please
# re-do the code so that it takes a shorter time. In addition, please add code
# to check that there are no edges between nodes of the same type.
def load_github_data() -> Tuple[nx.Graph, List[str], List[str]]:
    """Load the user-project bipartite graph from ``github_data.txt``.

    Every non-blank line of the file must hold exactly two whitespace-separated
    fields: a user ID followed by a project ID. Users are added as nodes tagged
    ``bipartite=0``, projects as nodes tagged ``bipartite=1``, and each line
    contributes one user-project edge.

    Returns:
        G: NetworkX graph object
        uid_list (list): sorted list of users
        pid_list (list): sorted list of projects

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields,
            or if some ID is used both as a user and as a project.
    """
    G = nx.Graph()
    users = set()
    projects = set()
    with open('github_data.txt', 'r') as file:
        for line_no, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"github_data.txt line {line_no}: expected "
                    f"'user_id project_id', got {line!r}"
                )
            user_id, project_id = fields
            G.add_node(user_id, bipartite=0)
            G.add_node(project_id, bipartite=1)
            G.add_edge(user_id, project_id)
            users.add(user_id)
            projects.add(project_id)

    # Check that there are no edges between nodes of the same type. Every edge
    # above joins the first field (a user) to the second field (a project), so
    # a same-type edge can only arise when one ID plays both roles: its node
    # would then carry a single, overwritten ``bipartite`` tag.
    shared_ids = users & projects
    if shared_ids:
        raise ValueError(
            f"{len(shared_ids)} ID(s) appear as both user and project "
            f"(e.g. {min(shared_ids)!r}); the graph is not bipartite"
        )

    # NOTE: We are also returning a list of users and projects. This will be
    # helpful when getting the correct user and project indices from the
    # projections.
    uid_list = sorted(users)
    pid_list = sorted(projects)
    return G, uid_list, pid_list


# Create a function to create the User-Project Matrix
def calculate_projections(G, uid_list, pid_list) -> Tuple[sp.sparse.spmatrix, sp.sparse.spmatrix]:
    """Compute both one-mode projections of the user-project graph.

    Inputs:
        G: NetworkX graph object
        uid_list (list): list of users; fixes the row order
        pid_list (list): list of projects; fixes the column order
    Returns:
        user_matrix (sp.sparse.spmatrix): one mode projection for users
        project_matrix (sp.sparse.spmatrix): one mode projection for projects
    """
    # Biadjacency matrix B: one row per user, one column per project.
    biadjacency = bipartite.biadjacency_matrix(
        G, row_order=uid_list, column_order=pid_list
    )
    transposed = biadjacency.T

    # B @ B.T is indexed user-by-user, B.T @ B project-by-project.
    # NOTE(review): with unweighted edges the off-diagonal entries should be
    # shared-neighbour counts -- confirm the edges carry no 'weight' attribute.
    user_matrix = biadjacency @ transposed
    project_matrix = transposed @ biadjacency
    return user_matrix, project_matrix


# Write a function that will return the pair of users that share the highest number of Github projects between them.
def get_user_pair(M, uid_list) -> Tuple[str, str]:
    """Return the pair of distinct users with the largest entry in ``M``.

    Only the strict upper triangle of ``M`` is examined, so the diagonal and
    the mirrored lower triangle are ignored. The search is vectorised over the
    stored entries of the sparse matrix instead of probing every (i, j) pair
    from Python, which is what made the loop version impractically slow.

    Inputs:
        M: projected matrix (sparse or dense), indexed in ``uid_list`` order;
            entries are assumed to be non-negative
        uid_list (list): list of users
    Returns:
        u1 (str) - first user
        u2 (str) - second user
    Raises:
        ValueError: if fewer than two users are given.
    """
    # Local imports keep this function self-contained.
    import numpy as np
    from scipy import sparse

    if len(uid_list) < 2:
        raise ValueError("need at least two users to form a pair")

    # k=1 drops the diagonal, keeping only pairs with row < col.
    upper = sparse.triu(sparse.csr_matrix(M), k=1, format="coo")
    upper.sum_duplicates()

    if upper.nnz:
        best = upper.data.max()
        if best > 0:
            hits = np.flatnonzero(upper.data == best)
            # Ties: keep the first pair in row-major order, matching a plain
            # nested scan with a strict ">" comparison.
            flat = upper.row[hits].astype(np.int64) * upper.shape[1] + upper.col[hits]
            k = hits[flat.argmin()]
            return uid_list[int(upper.row[k])], uid_list[int(upper.col[k])]

    # No pair has a positive entry: the first pair is as good as any.
    return uid_list[0], uid_list[1]


# Write a function that will return the pair of projects that share the highest number of users between them.
def get_project_pair(M, pid_list) -> Tuple[str, str]:
    """Return the pair of distinct projects with the largest entry in ``M``.

    Only the strict upper triangle of ``M`` is examined, so the diagonal and
    the mirrored lower triangle are ignored. The search is vectorised over the
    stored entries of the sparse matrix instead of probing every (i, j) pair
    from Python, which is what made the loop version impractically slow.

    Inputs:
        M: projected matrix (sparse or dense), indexed in ``pid_list`` order;
            entries are assumed to be non-negative
        pid_list (list): list of projects
    Returns:
        p1 (str) - first project
        p2 (str) - second project
    Raises:
        ValueError: if fewer than two projects are given.
    """
    # Local imports keep this function self-contained.
    import numpy as np
    from scipy import sparse

    if len(pid_list) < 2:
        raise ValueError("need at least two projects to form a pair")

    # k=1 drops the diagonal, keeping only pairs with row < col.
    upper = sparse.triu(sparse.csr_matrix(M), k=1, format="coo")
    upper.sum_duplicates()

    if upper.nnz:
        best = upper.data.max()
        if best > 0:
            hits = np.flatnonzero(upper.data == best)
            # Ties: keep the first pair in row-major order, matching a plain
            # nested scan with a strict ">" comparison.
            flat = upper.row[hits].astype(np.int64) * upper.shape[1] + upper.col[hits]
            k = hits[flat.argmin()]
            # Return the full IDs, not just their first character.
            return pid_list[int(upper.row[k])], pid_list[int(upper.col[k])]

    # No pair has a positive entry: the first pair is as good as any.
    return pid_list[0], pid_list[1]

Step by Step Solution

There are 3 Steps involved in it

1 Expert Approved Answer
Step: 1 Unlock blur-text-image
Question Has Been Solved by an Expert!

Get step-by-step solutions from verified subject matter experts

Step: 2 Unlock
Step: 3 Unlock

Students Have Also Explored These Related Databases Questions!