kmeans Update 1.2

- Added dependencies to info (adjusted template too)
- Removed unnecessary global variables
- Added commentary
- Saved a few variables
- Removed unnecessary libraries 'numpy' and 'multiprocessing'
This commit is contained in:
2018-06-01 03:15:33 +02:00
parent e1d794c006
commit 7ea392c302
2 changed files with 29 additions and 26 deletions

View File

@@ -4,9 +4,10 @@
#author: Tillmann Brendel, Conrad Großer #author: Tillmann Brendel, Conrad Großer
#license: Pending #license: Pending
#date: 26.05.2018 #date: 26.05.2018
#version: 1.1 #version: 1.2
#usage: python pyscript.py #usage: python pyscript.py
#notes: #notes:
#dependencies: matplotlib
#known_issues: #known_issues:
#python_version: 3.x #python_version: 3.x
#============================================================================== #==============================================================================
@@ -20,11 +21,7 @@ from datetime import date
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint from random import randint
# Importing libary for multi core processing
import multiprocessing
# Importing libaries for easy plotting # Importing libaries for easy plotting
import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
# Importing own libaries Datamining Libary and Datamining Test # Importing own libaries Datamining Libary and Datamining Test
@@ -41,54 +38,59 @@ def kmeansmk1(data, clusters):
# Get max value in the data array # Get max value in the data array
highPoint = dmlib.findHighest(data) highPoint = dmlib.findHighest(data)
# Define variables for running the algorithm (runs is just for benchmarking!)
done = 0 done = 0
runs = 0 runs = 0
# Until calcClusters reports done (returns non-zero), reassign the data to the clusters and rearrange the clusters
while done == 0: while done == 0:
runs = runs + 1 runs = runs + 1
new_data = assignCluster(data, highPoint, clusters) new_data = assignCluster(data, highPoint, clusters)
calcClusters(new_data, clusters) done = calcClusters(new_data, clusters)
for cluster in range(0, clusters):
#keeps the algorithm going until the central clusterpoint doesnt change anymore
if globals()["cpointchanged_" + str(cluster)] == 1:
done = 1
# Printing final clusters # Printing final clusters
for i in range(0, clusters): for i in range(0, clusters):
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs") print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
# Building an artificial index array, the same size as the original data, for visualizing 1D data in a 2D graphic
# plotting the random data and the found clusters
anew = [] anew = []
inew = 0 inew = 0
while inew < 1000: while inew < len(data):
anew.append(inew) anew.append(inew)
inew = inew + 1 inew = inew + 1
floatdata = [int(x) for x in data]
# Drawing found clusters as lines
for i in range(0, clusters): for i in range(0, clusters):
plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r') plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
plt.scatter(floatdata, anew, marker='x', s=7, color='k')
# Showing graph
plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
plt.show() plt.show()
return 0 return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters): def calcClusters(data, clusters):
changed = 0
for cluster in range(0, clusters): for cluster in range(0, clusters):
globals()["cpointchanged_" + str(cluster)] = 0 # Getting current cluster and saving it in temporary variable
globals()["oldcpoint_" + str(cluster)] = globals()["cpoint_" + str(cluster)] prev_cluster = globals()["cpoint_" + str(cluster)]
# Sum of the items assigned to this cluster, used to calculate the new cluster center as their average
clustersum = 0 clustersum = 0
count = 0 item_count = 0
for item in range(0, len(data[0])): for item in range(0, len(data[0])):
if data[1][item] == globals()["cpoint_" + str(cluster)]: if data[1][item] == globals()["cpoint_" + str(cluster)]:
clustersum = clustersum + int(data[0][item]) clustersum = clustersum + int(data[0][item])
count = count + 1 item_count = item_count + 1
globals()["cpoint_" + str(cluster)] = round(clustersum / count) globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
#checking if old clusterpoint is equal to the one just calculated # Checking if previous clusterpoint is equal to the one just calculated
if globals()["oldcpoint_" + str(cluster)] == globals()["cpoint_" + str(cluster)]: if prev_cluster == globals()["cpoint_" + str(cluster)]:
globals()["cpointchanged_" + str(cluster)] = 1 changed = 1
return 0
return changed
def assignCluster(data, highPoint, clusters): def assignCluster(data, highPoint, clusters):
# Create a new data array for working # Create a new data array for working

View File

@@ -7,6 +7,7 @@
#version: Versionnumber #version: Versionnumber
#usage: Description of how to use the programm quickly #usage: Description of how to use the programm quickly
#notes: Notes for parameters, thanks (...) #notes: Notes for parameters, thanks (...)
#dependencies: Preinstalled packages
#known_issues: Known issues in this version #known_issues: Known issues in this version
#python_version: Compatible (tested) python version #python_version: Compatible (tested) python version
#============================================================================== #==============================================================================