Misc Cleanup Update
This commit is contained in:
131
src/algorithms/k-means/kmeansMkI_2d.py
Normal file
131
src/algorithms/k-means/kmeansMkI_2d.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
# title: kmeansMkI.py
|
||||
# description: Our personal Python K-Means++ implementation
|
||||
# author: Tillmann Brendel, Conrad Großer
|
||||
# license: Pending
|
||||
# date: 04.06.2018
|
||||
# version: 1.6
|
||||
# usage: python pyscript.py
|
||||
# notes:
|
||||
# known_issues:
|
||||
# python_version: 3.x
|
||||
# ==============================================================================
|
||||
|
||||
# IMPORTS
|
||||
|
||||
# Importing the time for benchmarking purposes
|
||||
import time
|
||||
from datetime import date
|
||||
|
||||
# For random generation of numbers import randint
|
||||
from random import randint
|
||||
|
||||
# Importing libary for multi core processing
|
||||
import multiprocessing
|
||||
|
||||
# Importing libaries for easy plotting
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Importing own libaries Datamining Libary and Datamining Test
|
||||
import dmlib
|
||||
import dmtest
|
||||
|
||||
|
||||
# Main function of the algorithm
|
||||
def kmeansmk1(xdata, ydata, clusters):
|
||||
# Defining cluster points
|
||||
for i in range(0, clusters):
|
||||
globals()["cpoint_" + str(i)] = [xdata[randint(0, len(xdata))], ydata[randint(0, len(ydata))]]
|
||||
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
|
||||
|
||||
# Get the maximum of the data
|
||||
highpointx = max(xdata)
|
||||
highpointy = max(ydata)
|
||||
|
||||
# Define variables for running the algorithm (runs is just as important as every other variable)
|
||||
done = False
|
||||
runs = 0
|
||||
|
||||
# As long as calcClusters returns False it will rearrange the clusters and assign the data to the clusters
|
||||
while not done:
|
||||
runs += 1
|
||||
assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
|
||||
# assigned_points consists of the clusternumbers
|
||||
done = calcClusters(xdata, ydata, assigned_points, clusters)
|
||||
|
||||
for i in range(0, clusters):
|
||||
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
|
||||
for i in range(0, clusters):
|
||||
plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
|
||||
|
||||
plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
|
||||
plt.show()
|
||||
|
||||
|
||||
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
||||
def calcClusters(xdata, ydata, assigned_points, clusters):
|
||||
for cluster in range(0, clusters):
|
||||
cpointunchanged = True
|
||||
globals()["oldcpoint_" + str(cluster)] = globals()["cpoint_" + str(cluster)]
|
||||
clustersumx = 0
|
||||
clustersumy = 0
|
||||
count = 0
|
||||
|
||||
for item in range(0, len(xdata)):
|
||||
if assigned_points[item] == cluster:
|
||||
clustersumx = clustersumx + int(xdata[item])
|
||||
clustersumy = clustersumy + int(ydata[item])
|
||||
count = count + 1
|
||||
|
||||
globals()["cpoint_" + str(cluster)] = [round(clustersumx / count), round(clustersumy / count)]
|
||||
|
||||
# checking if old clusterpoint is equal to the one just calculated
|
||||
if globals()["oldcpoint_" + str(cluster)] != globals()["cpoint_" + str(cluster)]:
|
||||
cpointunchanged = False
|
||||
|
||||
return cpointunchanged
|
||||
|
||||
|
||||
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
|
||||
data_assigned = []
|
||||
assigned_cluster = 0
|
||||
resetdist = dmlib.calcdiff2d([0, 0], [highpointx, highpointy])
|
||||
|
||||
for item in range(0, len(xdata)):
|
||||
olddistance = resetdist
|
||||
for cluster in range(0, clusters):
|
||||
distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
|
||||
|
||||
if distance < olddistance:
|
||||
olddistance = distance
|
||||
assigned_cluster = cluster
|
||||
|
||||
data_assigned.append(assigned_cluster)
|
||||
|
||||
return data_assigned
|
||||
|
||||
|
||||
# Startup function for collecting necesarry xdata
|
||||
def startup(xdata, ydata):
|
||||
# Using two clusters for testing
|
||||
clusters = int(input("How many clusters are known? "))
|
||||
# cores = input("How many cores should be used? ")
|
||||
# path = input("Where is the xdata? ") or in this case xdata
|
||||
|
||||
# For benchmarking starting the timer now
|
||||
start_time = time.time()
|
||||
|
||||
# Firing up the engines!
|
||||
kmeansmk1(xdata, ydata, clusters)
|
||||
|
||||
# Stopping benchmark
|
||||
seconds = time.time() - start_time
|
||||
print(str(seconds) + " seconds for execution")
|
||||
|
||||
|
||||
# Start the algorithm and generate test xdata
|
||||
xdata = dmtest.numGenLight(10000, False, 5)
|
||||
ydata = dmtest.numGenLight(10000, False, 2)
|
||||
|
||||
startup(xdata, ydata)
|
||||
Reference in New Issue
Block a user