127 lines
4.2 KiB
Python
127 lines
4.2 KiB
Python
#!/usr/bin/env python
|
|
# title: kmeansMkI.py
|
|
# description: Our personal Python K-Means++ implementation
|
|
# author: Tillmann Brendel, Conrad Großer
|
|
# license: Pending
|
|
# date: 04.06.2018
|
|
# version: 1.6
|
|
# usage: python pyscript.py
|
|
# notes:
|
|
# known_issues:
|
|
# python_version: 3.x
|
|
# ==============================================================================
|
|
|
|
# IMPORTS
|
|
|
|
# Importing the time for benchmarking purposes
|
|
import time
|
|
|
|
# For random generation of numbers import randint
|
|
from random import randint
|
|
|
|
# Importing libaries for easy plotting
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Importing own libaries Datamining Libary and Datamining Test
|
|
import dmlib
|
|
import dmtest
|
|
|
|
|
|
# Main function of the algorithm
|
|
def kmeansmk1(xdata, ydata, clusters):
|
|
# Defining cluster points
|
|
for i in range(0, clusters):
|
|
globals()["cpoint_" + str(i)] = [xdata[randint(0, len(xdata))], ydata[randint(0, len(ydata))]]
|
|
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
|
|
|
|
# Get the maximum of the data
|
|
highpointx = max(xdata)
|
|
highpointy = max(ydata)
|
|
|
|
# Define variables for running the algorithm (runs is just as important as every other variable)
|
|
done = False
|
|
runs = 0
|
|
|
|
# As long as calcClusters returns False it will rearrange the clusters and assign the data to the clusters
|
|
while not done:
|
|
runs += 1
|
|
assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
|
|
# assigned_points consists of the clusternumbers
|
|
done = calcClusters(xdata, ydata, assigned_points, clusters)
|
|
|
|
for i in range(0, clusters):
|
|
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
|
|
for i in range(0, clusters):
|
|
plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
|
|
|
|
plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
|
|
plt.show()
|
|
|
|
|
|
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
|
def calcClusters(xdata, ydata, assigned_points, clusters):
|
|
for cluster in range(0, clusters):
|
|
cpointunchanged = True
|
|
globals()["oldcpoint_" + str(cluster)] = globals()["cpoint_" + str(cluster)]
|
|
clustersumx = 0
|
|
clustersumy = 0
|
|
count = 0
|
|
|
|
for item in range(0, len(xdata)):
|
|
if assigned_points[item] == cluster:
|
|
clustersumx = clustersumx + int(xdata[item])
|
|
clustersumy = clustersumy + int(ydata[item])
|
|
count = count + 1
|
|
|
|
globals()["cpoint_" + str(cluster)] = [round(clustersumx / count), round(clustersumy / count)]
|
|
|
|
# checking if old clusterpoint is equal to the one just calculated
|
|
if globals()["oldcpoint_" + str(cluster)] != globals()["cpoint_" + str(cluster)]:
|
|
cpointunchanged = False
|
|
|
|
return cpointunchanged
|
|
|
|
|
|
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
|
|
data_assigned = []
|
|
assigned_cluster = 0
|
|
resetdist = dmlib.calcdiff2d([0, 0], [highpointx, highpointy])
|
|
|
|
for item in range(0, len(xdata)):
|
|
olddistance = resetdist
|
|
for cluster in range(0, clusters):
|
|
distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
|
|
|
|
if distance < olddistance:
|
|
olddistance = distance
|
|
assigned_cluster = cluster
|
|
|
|
data_assigned.append(assigned_cluster)
|
|
|
|
return data_assigned
|
|
|
|
|
|
# Startup function for collecting necesarry xdata
|
|
def startup(xdata, ydata):
|
|
# Using two clusters for testing
|
|
clusters = int(input("How many clusters are known? "))
|
|
# cores = input("How many cores should be used? ")
|
|
# path = input("Where is the xdata? ") or in this case xdata
|
|
|
|
# For benchmarking starting the timer now
|
|
start_time = time.time()
|
|
|
|
# Firing up the engines!
|
|
kmeansmk1(xdata, ydata, clusters)
|
|
|
|
# Stopping benchmark
|
|
seconds = time.time() - start_time
|
|
print(str(seconds) + " seconds for execution")
|
|
|
|
|
|
# Start the algorithm and generate test xdata
|
|
xdata = dmtest.numGenLight(10000, False, 5)
|
|
ydata = dmtest.numGenLight(10000, False, 2)
|
|
|
|
startup(xdata, ydata)
|