Graph-Based Recommender

Importing necessary libraries

import networkx as nx
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community import kernighan_lin_bisection, louvain_communities
from networkx.algorithms.community.label_propagation import label_propagation_communities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams

Miscellaneous functions

def newColorHex():
    # Return a random 6-character hex color built from the characters 6-D,
    # which keeps the generated shades relatively light.
    color = '#'
    for _ in range(6):
        color += np.random.choice(list("6789ABCD"))
    return color

Reading CSV files containing relations between people and the products they bought

# Person-person relations and a mapping from person id to display name
edgelist_df = pd.read_csv('data/relations.csv')
people_df = pd.read_csv('data/people.csv')
names = people_df.set_index('id').T.to_dict('list')
names = {k: v[0] for k, v in names.items()}


# Purchases plus product names and prices, restricted to products that were actually bought
purchases = pd.read_csv('data/purchases.csv')
products = pd.read_csv('data/products.csv').set_index('id').T.to_dict()
product_names = {k: v['product'] for k, v in products.items() if k in purchases['product_id'].unique()}
product_prices = {k: int(v['price']) for k, v in products.items() if k in purchases['product_id'].unique()}
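
As a quick sanity check of the assumed file layout (relations.csv with id1/id2, people.csv with an id plus a name column, purchases.csv with a person id plus product_id, products.csv with id, product and price), the first rows of each frame can be printed; the column names not shown in the code above are inferred, not confirmed by the data files themselves.

# Inspect the first rows of each input file; column names beyond those used
# explicitly in this notebook are assumptions.
for df in (edgelist_df, people_df, purchases, pd.read_csv('data/products.csv')):
    print(df.head(), end='\n\n')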

Plotting relations between people

G = nx.from_pandas_edgelist(edgelist_df, source='id1', target='id2')
G = nx.relabel_nodes(G, names)

rcParams['figure.figsize'] = 14, 10
pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
d = dict(G.degree)
nx.draw(G, pos, node_color=newColorHex(),
        with_labels=True,
        nodelist=list(d),
        node_size=[d[k] * 300 for k in d])  # scale node size with degree

Finding communities

Partitioning the set of people via centrality measures (Girvan-Newman algorithm)

G = nx.from_pandas_edgelist(edgelist_df, source='id1', target='id2')
G = nx.relabel_nodes(G, names)

communities = girvan_newman(G)

node_groups = []
for com in next(communities):
    node_groups.append(set(com))

group_colors = [newColorHex() for _ in node_groups]

color_map = []
for node in G:
    for group_num in range(len(node_groups)):
        if node in node_groups[group_num]:
            color_map.append(group_colors[group_num])

plt.figure(5, figsize=(14, 10))
nx.draw(G, node_color=color_map, with_labels=True, node_size=2000)
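
girvan_newman returns an iterator over successively finer partitions, and the next() call above only takes the first split into two communities. A minimal sketch of inspecting deeper levels of the dendrogram with itertools.islice:

from itertools import islice

# Number of communities at the first three levels of the Girvan-Newman dendrogram
for level, partition in enumerate(islice(girvan_newman(G), 3), start=1):
    print(f'level {level}: {len(partition)} communities')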

Finding a bipartition (Kernighan-Lin bisection algorithm)

G = nx.from_pandas_edgelist(edgelist_df, source='id1', target='id2')
G = nx.relabel_nodes(G, names)

communities = kernighan_lin_bisection(G)

node_groups = []
for com in communities:
    node_groups.append(com)

group_colors = [newColorHex() for _ in node_groups]

color_map = []
for node in G:
    for group_num in range(len(node_groups)):
        if node in node_groups[group_num]:
            color_map.append(group_colors[group_num])

plt.figure(5, figsize=(14, 10))
nx.draw(G, node_color=color_map, with_labels=True, node_size=2000)
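
One way to compare the partitions produced by the different algorithms is their modularity; a minimal sketch for the Kernighan-Lin bisection computed above, using the modularity function from networkx.algorithms.community:

from networkx.algorithms.community import modularity

# Modularity of the two-way split found above (higher indicates a stronger community structure)
print(modularity(G, communities))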

Finding communities using the label propagation algorithm

G = nx.from_pandas_edgelist(edgelist_df, source='id1', target='id2')
G = nx.relabel_nodes(G, names)

communities = label_propagation_communities(G)

node_groups = []
for com in communities:
    node_groups.append(com)

group_colors = [newColorHex() for _ in node_groups]

color_map = []
for node in G:
    for group_num in range(len(node_groups)):
        if node in node_groups[group_num]:
            color_map.append(group_colors[group_num])

plt.figure(5, figsize=(14, 10))
nx.draw(G, node_color=color_map, with_labels=True, node_size=2000)
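
For a quick textual view of the same result, the size and the members of each detected community can be printed (a minimal sketch reusing the graph from the cell above):

# Print each label propagation community with its size
for i, com in enumerate(label_propagation_communities(G)):
    print(f'community {i}: {len(com)} people -> {sorted(com)}')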

Detecting communities using the Louvain algorithm

G = nx.from_pandas_edgelist(edgelist_df, source='id1', target='id2')
G = nx.relabel_nodes(G, names)

communities = louvain_communities(G)

node_groups = []
for com in communities:
    node_groups.append(com)

group_colors = [newColorHex() for _ in node_groups]

color_map = []
for node in G:
    for group_num in range(len(node_groups)):
        if node in node_groups[group_num]:
            color_map.append(group_colors[group_num])

plt.figure(5, figsize=(14, 10))
nx.draw(G, node_color=color_map, with_labels=True, node_size=2000)
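
The Louvain method is randomized, so the communities can differ between runs; passing a seed makes the result reproducible, and the resolution parameter controls the granularity. A sketch with parameter values chosen purely for illustration:

# Reproducible Louvain run; resolution above 1 tends to produce more, smaller communities
communities = louvain_communities(G, seed=42, resolution=1.2)
print(len(communities), [len(c) for c in communities])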

Plotting people and their purchases

G = nx.Graph()
G.add_edges_from(edgelist_df.values)   # person-person relation edges
G.add_edges_from(purchases.values)     # person-product purchase edges

person_nodes = set(people_df.id.tolist())
product_nodes = set(purchases.product_id)

person_color = newColorHex()
product_color = newColorHex()

color_map = []
for node in G:
    if node in person_nodes:
        color_map.append(person_color)
    elif node in product_nodes:
        color_map.append(product_color)

G = nx.relabel_nodes(G, names)
G = nx.relabel_nodes(G, product_names)

plt.figure(5, figsize=(14, 10))
nx.draw(G, node_color=color_map, with_labels=True, node_size=2000)
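
Because each purchase contributes one edge to a product node, a product's degree in this combined graph is the number of people who bought it (assuming person and product ids do not overlap). A minimal sketch ranking products by that count:

# Rank purchased products by how many people are connected to them
popularity = {product: G.degree(product) for product in product_names.values()}
for product, count in sorted(popularity.items(), key=lambda kv: kv[1], reverse=True):
    print(f'{count:2d}  {product}')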

Using the resource allocation index

G = nx.Graph()
G.add_edges_from(edgelist_df.values)
G.add_edges_from(purchases.values)

# Candidate (person, product) pairs that are not already connected by an edge
ebunch = []
for node1 in G:
    for node2 in G:
        if node1 != node2 and (node1, node2) not in G.edges and (node2, node1) not in G.edges:
            if node1 in person_nodes and node2 in product_nodes:
                ebunch.append((node1, node2))

iterator_list = list(nx.resource_allocation_index(G, ebunch=ebunch))
iterator_list_sorted = sorted(iterator_list, key=lambda x: x[2], reverse=True)
rai_predictions = list(map(lambda x: (names[x[0]], product_names[x[1]], x[2]), iterator_list_sorted))

df_predictions = pd.DataFrame(rai_predictions, columns=['person', 'product', 'resource_allocation_index'])
df_predictions
             person                              product  resource_allocation_index
0        Ruben Shaw  Sony Alpha a7 III Mirrorless Camera                   0.500000
1      Sasha Howard           Google Nest Mini (2nd Gen)                   0.416667
2     Dereon Bright           Google Nest Mini (2nd Gen)                   0.375000
3    Paola Whitaker       Canon EOS Rebel T7 DSLR Camera                   0.366667
4      Sasha Howard                      Nintendo Switch                   0.366667
..              ...                                  ...                        ...
229    Katrina Barr                       iPhone XR 64GB                   0.000000
230    Katrina Barr                        PlayStation 5                   0.000000
231    Katrina Barr                        Xbox Series X                   0.000000
232    Katrina Barr                          Dell XPS 13                   0.000000
233    Katrina Barr                Garmin Forerunner 945                   0.000000

[234 rows x 3 columns]
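
To turn the ranked table into per-person suggestions, one straightforward option is to keep the few highest-scoring products for each person (a sketch over the df_predictions frame above):

# Keep the top 3 product suggestions per person, ordered by resource allocation index
top3 = (df_predictions.sort_values('resource_allocation_index', ascending=False)
                      .groupby('person', sort=False)
                      .head(3))
print(top3.head(15).to_string(index=False))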

Using the Jaccard coefficient

G = nx.Graph()
G.add_edges_from(edgelist_df.values)
G.add_edges_from(purchases.values)

ebunch = []
for node1 in G:
    for node2 in G:
        if node1 != node2 and (node1, node2) not in G.edges and (node2, node1) not in G.edges:
            if node1 in person_nodes and node2 in product_nodes:
                ebunch.append((node1, node2))

jaccard_coefficients_list = list(nx.jaccard_coefficient(G, ebunch=ebunch))
jaccard_coefficients_sorted = sorted(jaccard_coefficients_list, key=lambda x: x[2], reverse=True)
jaccard_predictions = list(map(lambda x: (names[x[0]], product_names[x[1]], x[2]), jaccard_coefficients_sorted))

df_jaccard_predictions = pd.DataFrame(jaccard_predictions, columns=['person', 'product', 'jaccard_coefficient'])
df_jaccard_predictions
             person                product  jaccard_coefficient
0    Lilliana Keith          PlayStation 5             0.500000
1    Lilliana Keith        Nintendo Switch             0.333333
2      Camille Khan  Garmin Forerunner 945             0.333333
3        Kenny Kidd          PlayStation 5             0.333333
4        Damon Webb         iPhone XR 64GB             0.333333
..              ...                    ...                  ...
229    Katrina Barr         iPhone XR 64GB             0.000000
230    Katrina Barr          PlayStation 5             0.000000
231    Katrina Barr          Xbox Series X             0.000000
232    Katrina Barr            Dell XPS 13             0.000000
233    Katrina Barr  Garmin Forerunner 945             0.000000

[234 rows x 3 columns]
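
The Jaccard coefficient of a pair (u, v) is the size of the intersection of their neighbor sets divided by the size of the union. A minimal sketch verifying one entry by hand against NetworkX (the pair is chosen arbitrarily from ebunch):

# Recompute the Jaccard coefficient of one candidate pair by hand
u, v = ebunch[0]
neighbors_u, neighbors_v = set(G[u]), set(G[v])
manual = len(neighbors_u & neighbors_v) / len(neighbors_u | neighbors_v)
print(manual, next(iter(nx.jaccard_coefficient(G, ebunch=[(u, v)])))[2])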

Using the Adamic-Adar index

G = nx.Graph()
G.add_edges_from(edgelist_df.values)
G.add_edges_from(purchases.values)

ebunch = []
for node1 in G:
    for node2 in G:
        if node1 != node2 and (node1, node2) not in G.edges and (node2, node1) not in G.edges:
            if node1 in person_nodes and node2 in product_nodes:
                ebunch.append((node1, node2))

iterator_list = list(nx.adamic_adar_index(G, ebunch=ebunch))
iterator_list_sorted = sorted(iterator_list, key=lambda x: x[2], reverse=True)
aai_predictions = list(map(lambda x: (names[x[0]], product_names[x[1]], x[2]), iterator_list_sorted))

df_predictions = pd.DataFrame(aai_predictions, columns=['person', 'product', 'adamic_adar_index'])
df_predictions
             person                              product  adamic_adar_index
0        Ruben Shaw  Sony Alpha a7 III Mirrorless Camera           0.500000
1      Sasha Howard           Google Nest Mini (2nd Gen)           0.416667
2     Dereon Bright           Google Nest Mini (2nd Gen)           0.375000
3    Paola Whitaker       Canon EOS Rebel T7 DSLR Camera           0.366667
4      Sasha Howard                      Nintendo Switch           0.366667
..              ...                                  ...                ...
229    Katrina Barr                       iPhone XR 64GB           0.000000
230    Katrina Barr                        PlayStation 5           0.000000
231    Katrina Barr                        Xbox Series X           0.000000
232    Katrina Barr                          Dell XPS 13           0.000000
233    Katrina Barr                Garmin Forerunner 945           0.000000

[234 rows x 3 columns]
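
The Adamic-Adar index differs from the resource allocation index only in how common neighbors are weighted (1/log(deg(w)) instead of 1/deg(w)). A small sketch printing both scores side by side for the first few candidate pairs:

# Compare resource allocation and Adamic-Adar scores for a few candidate pairs
pairs = ebunch[:5]
for (u, v, ra), (_, _, aa) in zip(nx.resource_allocation_index(G, ebunch=pairs),
                                  nx.adamic_adar_index(G, ebunch=pairs)):
    print(f'{names[u]:<20} {product_names[v]:<40} RA={ra:.3f}  AA={aa:.3f}')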

Using the common neighbor and centrality based parameterized algorithm (CCPA) score

G = nx.Graph()
G.add_edges_from(edgelist_df.values)
G.add_edges_from(purchases.values)

ebunch = []
for node1 in G:
    for node2 in G:
        if node1 != node2 and (node1, node2) not in G.edges and (node2, node1) not in G.edges:
            if node1 in person_nodes and node2 in product_nodes:
                ebunch.append((node1, node2))

iterator_list = list(nx.common_neighbor_centrality(G, ebunch=ebunch, alpha=0.8)) # default value alpha = 0.8
iterator_list_sorted = sorted(iterator_list, key=lambda x: x[2], reverse=True)
ccpa_predictions = list(map(lambda x: (names[x[0]], product_names[x[1]], x[2]), iterator_list_sorted))

df_predictions = pd.DataFrame(ccpa_predictions, columns=['person', 'product', 'ccpa_score'])
df_predictions
              person                         product  ccpa_score
0     Paola Whitaker                 Nintendo Switch    5.000000
1     Paola Whitaker  Canon EOS Rebel T7 DSLR Camera    5.000000
2     Lilliana Keith                  MacBook Air M1    5.000000
3     Lilliana Keith                 Nintendo Switch    5.000000
4     Lilliana Keith              Samsung Galaxy S10    5.000000
..               ...                             ...         ...
229     Yusuf Wagner       Amazon Echo Dot (4th Gen)    1.360000
230  Brennan Trevino      Google Nest Mini (2nd Gen)    1.360000
231  Brennan Trevino                   PlayStation 5    1.360000
232  Brennan Trevino       Amazon Echo Dot (4th Gen)    1.360000
233       Damon Webb       Amazon Echo Dot (4th Gen)    1.133333

[234 rows x 3 columns]
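
As one possible final step, the single highest-scoring unpurchased product can be picked for each person and joined with the catalogue price loaded earlier (a sketch; the price column is added here only for illustration):

# Pick the single best CCPA suggestion per person and attach its catalogue price
best = df_predictions.loc[df_predictions.groupby('person')['ccpa_score'].idxmax()].copy()
price_by_name = {product_names[pid]: price for pid, price in product_prices.items()}
best['price'] = best['product'].map(price_by_name)
print(best.sort_values('ccpa_score', ascending=False).to_string(index=False))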