Source code for markovianSBM.Clustering

import numpy as np
import cvxpy as cp
import numpy as np
import os


[docs]def add_liste(Level, Liste, Node, UP=True): if Level==-1: Liste = [Node] + Liste elif Level >= len(Liste): Liste.append([Node]) if UP: Level -= 1 else: Level += 1 else: Liste.append([Node]) if UP: Level -= 1 else: Level += 1 return Level, Liste
[docs]def recu(node, level, graph, liste, node2ind, dejavu, dico_closest, nodes): L = [] for ind in range(len(nodes)): if graph[ind,node2ind[node]] and dejavu[ind]==0: L.append(ind) for ind in L: if graph[ind,node2ind[node]] and dejavu[ind]==0: level, liste = add_liste(level, liste, nodes[ind], UP=True) dejavu[ind] = 1 graph, liste, dejavu = recu(nodes[ind], level, graph, liste, node2ind, dejavu, dico_closest, nodes) return graph, liste, dejavu
[docs]class Clustering: """ Class that performs the final rounding step on the rows of the matrix $\hat{B}$ which is the optimal solution of the SDP relaxation of the K-means problem. """ def __init__(self, n, K): """ :param n: Nomber of nodes in the graph :param K: Number of clusters """ self.n = n self.K = K
[docs] def solve_relaxed_LP(self, M): """ Prelimanary to run the K-medoid algorithm. :param M: M is the output of the relaxed SDP problem. """ barx = cp.Variable((self.n,self.n)) bary = cp.Variable(self.n) C = np.zeros((self.n,self.n)) for i in range(1, self.n): for j in range(i): C[i,j] = np.linalg.norm(M[:,i]-M[:,j], ord=1) C[j,i] = C[i,j] constraints = [ cp.sum(barx[:,j]) == 1 for j in range(self.n)] constraints += [ - barx[i%self.n,i//self.n] + bary[i%self.n] >= 0 for i in range(self.n * self.n) ] constraints += [ self.K - cp.sum(bary) >= 0 ] constraints += [ barx[i%self.n,i//self.n] >=0 for i in range(self.n * self.n) ] constraints += [ bary[i] >=0 for i in range(self.n) ] prob = cp.Problem(cp.Minimize(cp.trace(C@barx)), constraints) prob.solve() barx = barx.value bary = bary.value return barx, bary, C
[docs] def Kmedoids(self, barx, bary, C): """ Kmedoid algorithm that performs a rounding step on the rows of $\hat{B}$. :param barx: Output of the method 'solve_relaxed_LP' :param bary: Output of the method 'solve_relaxed_LP' :param C: Output of the method 'solve_relaxed_LP' """ # Step 1 : consolidating locations ## It consists in moving revelantly demand. ## All locations with positive demand will be far from eachother, garanteeing: ## - not increase the cost of the fractional solution ## - each feasible integer solution for the modified instance can be converted to a feasible integer ## solution for the original instance with a small added cost d = np.ones(self.n) barx = barx * (1* (barx>0)) barC = np.diag(C @ barx) ind = np.argsort(barC) for j in range(self.n): for i in range(j): if d[ind[i]]>0 and C[ind[i],ind[j]]<=4*barC[ind[j]]: d[ind[i]] += d[ind[j]] d[ind[j]] = 0 # Step 2 : Consolidating centers ## Construction of a 1/2-resticted solution ## y_i=0 if d[i]=0 and y_i >=1/2 otherwise (without paying this too much). px = np.copy(barx) py = np.copy(bary) # Closest[i] will be the closest node to i (except i), let's say j, satisfying d[j]>0 closest = [] dico_weights = {} sorted_lines_C = np.zeros((self.n,self.n)) for i in range(self.n): ls = np.argsort(C[i,:]) sorted_lines_C[i,:] = ls j = 0 while (d[int(ls[j])]==0 or i==int(ls[j])): j += 1 closest.append(ls[j]) # We only move the nodes with a null demand and a partial open center (i.e y>0) and thus that can't be a center if d[i]==0 and py[i]>0: # We assign i to the closest point j such that d[j] > 0 py[ls[j]] = min(1 , py[i] + py[ls[j]]) py[i] = 0 for pj in range(self.n): px[ls[j],pj] += px[i,pj] px[i,pj] = 0 if d[i]>0: dico_weights[i] = d[i]*C[closest[i],i] ## Construction of a (1/2-1)-integral solution ## y_i=0 if d[i]=0 and y_i =1/2 or 1 otherwise (without paying this too much). pn = len(dico_weights) haty = np.zeros(self.n) hatx = np.zeros((self.n,self.n)) # We sort the locations j \in N' (ie such that d[j]>0) in decreasing order of their weight sorted_dico_weights = {k: v for k, v in sorted(dico_weights.items(), key=lambda item: item[1], reverse=True)} list_keys = sorted_dico_weights.keys() #list of nodes in N' (i.e. with d[i]>0) for ind, j in enumerate(list_keys): if ind < 2*self.K-pn: haty[j] = 1 else: haty[j] = 1/2 hatx[closest[j],j] = 1/2 hatx[j,j] = haty[j] # Step 3 : rounding a {1/2, 1}-integral solution to an integral one centers = [k for k in list_keys if haty[k] == 1] odd_level = [] even_level = [] dico_closest = {i:closest[i] for i in list_keys if haty[i]==1/2} import copy dic = dico_closest.copy() for key,value in dic.items(): try: if dico_closest[value]==key: del dico_closest[key] except: pass keys = list(dico_closest.keys()) values = list(dico_closest.values()) nodes = list(set(keys+values)) graph = np.zeros((len(nodes),len(nodes))) node2ind = {node:ind for ind,node in enumerate(nodes)} for ind,key in enumerate(keys): graph[node2ind[key],node2ind[values[ind]]] = 1 dejavu = np.zeros(len(nodes)) while len(dejavu)!=np.sum(dejavu): notfound = True indj = 0 level = 0 liste = [] while notfound and indj<len(nodes): if np.sum(graph[:,indj])==0 and np.sum(graph[indj,:])>0 and dejavu[indj]==0: notfound = False j = nodes[indj] liste.append([j]) s_j = dico_closest[j] liste.append([s_j]) inds_j = node2ind[s_j] dejavu[indj] = 1 dejavu[inds_j] = 1 graph, liste, dejavu = recu(s_j, 1, graph, liste, node2ind, dejavu, dico_closest, nodes) level = 2 j = s_j indj = inds_j leaf = False while not(leaf): if np.sum(graph[indj,:])>0: s_j = dico_closest[j] inds_j = node2ind[s_j] if dejavu[inds_j]==0: dejavu[inds_j] = 1 graph, liste, dejavu = recu(s_j, level-1, graph, liste, node2ind, dejavu, dico_closest, nodes) level, liste = add_liste(level, liste, s_j, UP=False) j = s_j indj = inds_j else: leaf = True else: leaf = True indj += 1 for ind in range(len(liste)): if ind % 2: even_level += liste[ind] else: odd_level += liste[ind] if len(odd_level)<len(even_level): centers = centers + list(set(odd_level)) else: centers = centers + list(set(even_level)) centers = list(set(centers)) # -1 is assigned to the nodes that are not centers. Otherwise, we numerote them. num_center = -np.ones(self.n) for ind, j in enumerate(centers): num_center[j] = int(ind) A = np.zeros((self.n,self.K)) for i in range(self.n): if num_center[i] == -1: ls = sorted_lines_C[i,:] count = 0 while num_center[int(ls[count])] == -1: count += 1 A[i,int(num_center[int(ls[count])])] = 1 else: A[i,int(num_center[i])] = 1 self.clusters_approx = [] for i in range(self.n): for j in range(self.K): if A[i,j]==1: self.clusters_approx.append(j) self.true_partition = self.build_partition(self.clusters) approx_partition = self.build_partition(self.clusters_approx) self.find_permutation(self.true_partition, approx_partition) inverse = [0] * len(self.permutation) for i, p in enumerate(self.permutation): inverse[p] = i clusts_approx = np.copy(self.clusters_approx) for i,group in enumerate(clusts_approx): self.clusters_approx[i] = inverse[group]
[docs] def build_partition(self, clust): """ Given a list clust that associates to each node its community, this methods builds the associated partition of the noes of the graph. :param clust: List of estimated clusters for the n nodes of the graph """ d = {i:set() for i in range(self.K)} for i in range(len(clust)): d[clust[i]].add(i) return (d)
[docs] def find_permutation(self, true_partition, approx_partition): """ Find the permutation between the names of the true communities and the ones estimated by our algorithm. :param true_partition: True partition of the nodes in the graph according to their clusters :param approx_partition: Estimated partition of the nodes """ import itertools permus = list(itertools.permutations([i for i in range(self.K)])) best_error = np.float('inf') for permu in permus: error = 0 for k in range(self.K): try: error += len(true_partition[k]-approx_partition[permu[k]]) except: error += len(true_partition[k]) if error < best_error: best_error = error self.permutation = permu