kmeans Update 0.2

- Added documentation - Added new array for data savings Co-Authored-By: tchemn <tchemn@users.noreply.github.com>
2018-05-26 21:25:57 +02:00
parent c6c73122b0
commit 5f9143b604
1 changed files with 37 additions and 13 deletions
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -3,7 +3,7 @@
 #description:		Our personal Python K-Means++ implementation
 #author:			Tillmann Brendel, Conrad Großer
 #date:				26.05.2018
-#version:			0.1
+#version:			0.2
 #usage:				python pyscript.py
 #notes:
 #known_issues:
@@ -25,29 +25,50 @@ import multiprocessing
 # CODE
 # Main function of the algorithm
 def kmeansmk1(data):
+	# Using two clusters for testing
 	clusters = 2
-	data_size = len(data)
+
+	# Create a new data array for working
+	new_data = []
+	new_data.append(data)
+
+	# Get the size of the data array
+	data_size = len(new_data[0])

 	# Defining cluster points
 	for i in range(0, clusters):
 		globals()["cpoint_" + str(i)] = randint(0, data_size)
-		print("Cluster " + str(i) + ": " + str(data[globals()["cpoint_" + str(i)]]))
+		print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]]))

+	# Create new array for assigned clusters of each value
 	data_assigned = []
-	highPoint = findHighest(data)

+	# Get max value in the data array
+	highPoint = findHighest(new_data[0])
+
+	# For each item in data find the cluster with the minimal difference and store that cluster's index in the second list of the new data array (new_data[1][item])
 	for item in range(0, data_size):
+		# Initialise the minimal cluster difference with the highest value in the list to ease comparison
 		min_cluster = highPoint
+
+		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
 		for cluster in range(0, clusters):
 			clusternumber = globals()["cpoint_" + str(cluster)]
-			if min_cluster > calcdiff(item, clusternumber, data):
-				min_cluster = calcdiff(item, clusternumber, data)
+			if min_cluster > calcdiff(item, clusternumber, new_data[0]):
+				min_cluster = calcdiff(item, clusternumber, new_data[0])
 				assinged_cluster = clusternumber
+		# Assign the minimal difference cluster to the data
 		data_assigned.append(assinged_cluster)
+	# Add the assigned values list to the new_data array
+	new_data.append(data_assigned)

-	# for item in range(0, data_size):
-	# 	print("Datapoint: " + str(data[item]) + " | Assigned cluster: " + str(data[data_assigned[item]]))
+	# Print out the list of datapoints and assigned clusters
+	for item in range(0, len(new_data[0])):
+		print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]]))

+	return new_data
+
+# Determine the highest int value in an array
 def findHighest(data):
 	maximum = 0
 	for i in range(0, len(data)):
@@ -55,6 +76,7 @@ def findHighest(data):
 			maximum = int(data[i])
 	return maximum

+# Calculate the absolute difference between two data entries, given their indexes in the data array
 def calcdiff(point1, point2, data):
 	if int(data[point2]) > int(data[point1]):
 		difference = int(data[point2]) - int(data[point1])
@@ -63,10 +85,11 @@ def calcdiff(point1, point2, data):
 	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
 	return betrag(difference)

-def betrag(zahl):
-	if zahl < 0:
-		zahl = int((-2 * zahl) / 2)
-	return zahl
+# Get the absolute value of a number
+def betrag(number):
+	if number < 0:
+		number = int((-2 * number) / 2)
+	return number

 # Startup function for collecting necesarry data
 def startup(data):
@@ -79,7 +102,7 @@ def startup(data):

 	# Firing up the engines!
 	kmeansmk1(data)
-	# kmeansmk1(clusters, cores, path)
+	# kmeansmk1(clusters, cores, data)

 	# Stopping benchmark
 	seconds = time.time() - start_time
@@ -111,5 +134,6 @@ def generatePLZ(start):
 			plz = plz + str(randint(0,9))
 	return plz

+# Start the algorithm and generate test data
 data = testgenerator()
 startup(data)