Misc Cleanup Update

This commit is contained in:
2019-07-04 18:39:01 +02:00
parent 0269c384da
commit 841d47e9c1
7 changed files with 95 additions and 87 deletions

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.6
# usage: python kmeansMkI.py
# notes:
# known_issues:
# python_version: 3.x
# ==============================================================================
# IMPORTS
# Importing the time for benchmarking purposes
import time
from datetime import date
# For random generation of numbers import randint
from random import randint
# Importing library for multi-core processing
import multiprocessing
# Importing libraries for easy plotting
import numpy as np
import matplotlib.pyplot as plt
# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
import dmlib
import dmtest
# Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
    """Run the k-means loop on 2D data, then print and plot the final centres.

    The cluster centres live in module globals named ``cpoint_<i>`` because
    ``assignCluster`` and ``calcClusters`` read and update them there.

    Args:
        xdata: Sequence of x coordinates, one per data point.
        ydata: Sequence of y coordinates, one per data point.
        clusters: Number of clusters to fit (must be at least 1).

    Raises:
        ValueError: If ``clusters`` is smaller than 1 or the data is empty.
    """
    if clusters < 1:
        raise ValueError("clusters must be at least 1")
    if len(xdata) == 0 or len(ydata) == 0:
        raise ValueError("xdata and ydata must not be empty")

    state = globals()

    # Defining cluster points: x and y are drawn independently from the data.
    # randint() includes BOTH bounds, so the upper bound has to be the last
    # valid index, otherwise the lookup can run one past the end.
    for i in range(clusters):
        state["cpoint_" + str(i)] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(state["cpoint_" + str(i)]))

    # Get the maximum of the data (handed on to assignCluster)
    highpointx = max(xdata)
    highpointy = max(ydata)

    # Repeat "assign points, recompute centres" until calcClusters reports
    # that no centre changed; runs counts the iterations for the report below.
    done = False
    runs = 0
    while not done:
        runs += 1
        # assigned_points holds one cluster number per data point
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(state["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Plot the final centres in red on top of the raw data points.
    for i in range(clusters):
        centre = state["cpoint_" + str(i)]
        plt.plot(centre[0], centre[1], 'ro')
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
# Recalculates each cluster's centre as the mean of its assigned points; assigned_points[i] is the cluster number of data point i
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    The centres are read from and written back to the module globals
    ``cpoint_<i>``. The previous centre of each cluster is also stored in
    ``oldcpoint_<i>``.

    Args:
        xdata: Sequence of x coordinates (each converted with ``int``).
        ydata: Sequence of y coordinates (each converted with ``int``).
        assigned_points: Flat sequence; entry ``i`` is the cluster number of
            data point ``i``.
        clusters: Number of clusters.

    Returns:
        True if no centre changed during this call, False otherwise.
    """
    state = globals()
    # The flag must cover ALL clusters, so it is initialised once before the
    # loop; resetting it per cluster would report only the last cluster.
    all_unchanged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = state[key]
        # Kept so that the module still exposes the previous centre globally.
        state["oldcpoint_" + str(cluster)] = previous

        sum_x = 0
        sum_y = 0
        count = 0
        for x, y, owner in zip(xdata, ydata, assigned_points):
            if owner == cluster:
                sum_x += int(x)
                sum_y += int(y)
                count += 1

        if count == 0:
            # No points were assigned to this cluster: keep its centre where
            # it is instead of dividing by zero.
            continue

        state[key] = [round(sum_x / count), round(sum_y / count)]
        # checking if old clusterpoint is equal to the one just calculated
        if state[key] != previous:
            all_unchanged = False
    return all_unchanged
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every data point to the cluster whose centre is closest.

    Centres are read from the module globals ``cpoint_<i>``; distances come
    from ``dmlib.calcdiff2d`` (presumably a 2D distance - confirm in dmlib).

    Args:
        xdata: Sequence of x coordinates, one per data point.
        ydata: Sequence of y coordinates, one per data point.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        A list with one cluster number per data point, in input order. On
        equal distances the lowest cluster number wins.
    """
    # Look the centres up once instead of once per point and cluster.
    state = globals()
    centres = [state["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for x, y in zip(xdata, ydata):
        # The best match is tracked per point and starts without any
        # artificial upper bound, so a point can never inherit the previous
        # point's cluster just because all its distances were "too large".
        best_cluster = 0
        best_distance = None
        for cluster, centre in enumerate(centres):
            distance = dmlib.calcdiff2d(centre, [x, y])
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_cluster = cluster
        data_assigned.append(best_cluster)
    return data_assigned
# Startup function for collecting the necessary data
def startup(xdata, ydata):
    """Ask for the cluster count, run kmeansmk1 and print the wall-clock time.

    Args:
        xdata: Sequence of x coordinates handed on to kmeansmk1.
        ydata: Sequence of y coordinates handed on to kmeansmk1.
    """
    # The number of clusters is entered interactively by the user.
    cluster_count = int(input("How many clusters are known? "))

    # Benchmark: the timer covers only the clustering run, not the prompt.
    began = time.time()
    kmeansmk1(xdata, ydata, cluster_count)
    elapsed = time.time() - began

    print("{} seconds for execution".format(elapsed))
# Script entry: generate two test data series with dmtest.numGenLight and run
# the interactive clustering on them. There is no __main__ guard, so these
# statements also execute when the module is imported.
# NOTE(review): the meaning of numGenLight's arguments (10000, False, 5 / 2)
# is not visible in this file - confirm against dmtest.
xdata = dmtest.numGenLight(10000, False, 5)
ydata = dmtest.numGenLight(10000, False, 2)
startup(xdata, ydata)