kmeans Update 1.0

- Exported some functions into separate libraries - Finished the algorithm, added calcClusters function - Optimized code
2018-05-30 23:22:02 +02:00
parent 5f9143b604
commit 77f3a16c45
5 changed files with 96 additions and 78 deletions
--- a/src/algorithms/pycache/dmlib.cpython-36.pyc
+++ b/src/algorithms/pycache/dmlib.cpython-36.pyc
--- a/src/algorithms/pycache/dmtest.cpython-36.pyc
+++ b/src/algorithms/pycache/dmtest.cpython-36.pyc
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -0,0 +1,22 @@
 # Calculate the absolute difference between two values. Both values are converted
 # with int() first, so PLZ strings such as "05123" are accepted.
 # data is not used by the calculation; it stays in the signature (now optional)
 # so existing three-argument calls keep working.
 def calcdiff(point1, point2, data=None):
 	# abs() covers both orderings, so neither the manual branch nor betrag() is needed
 	return abs(int(point2) - int(point1))
 # Get the absolute value of a number.
 # Negative input is negated and passed through int(); non-negative input is
 # returned unchanged.
 def betrag(number):
 	if number < 0:
 		# Negate directly: the former int((-2 * number) / 2) went through float
 		# division and silently rounded integers larger than 2**53
 		return int(-number)
 	return number
 # Return the largest entry of data, with every entry converted via int().
 # The search starts at 0, so 0 is returned for an empty sequence and whenever
 # no entry is greater than 0.
 def findHighest(data):
 	highest = 0
 	for entry in data:
 		value = int(entry)
 		if value > highest:
 			highest = value
 	return highest
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -0,0 +1,28 @@
 # For random generation of numbers import randint
 from random import randint, shuffle
 # Simple generator for test data: builds 100 PLZs and returns them shuffled as a 1D list.
 # Split as coded: indexes 0-40 (41 entries) start with "05", indexes 41-79
 # (39 entries) start with "50", indexes 80-99 (20 entries) are fully random.
 def testgenerator():
 	dataArray = []
 	for i in range(100):
 		if i <= 40:
 			prefix = "05"
 		elif i < 80:
 			# i > 40 is already guaranteed here because the first branch failed
 			prefix = "50"
 		else:
 			prefix = ""
 		dataArray.append(generatePLZ(prefix))
 	shuffle(dataArray)
 	return dataArray
 # Generates a PLZ: a string that begins with start and is filled up with random
 # digits until it is length characters long (5 by default).
 # A start that already has length characters or more is returned unchanged.
 def generatePLZ(start, length=5):
 	# Derive the digit count from the prefix instead of hard-coding 5 (empty
 	# prefix) and 3 (two-character prefix)
 	missing = length - len(start)
 	return start + "".join(str(randint(0, 9)) for _ in range(missing))
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -3,7 +3,7 @@
 #description:		Our personal Python K-Means++ implementation
 #author:			Tillmann Brendel, Conrad Großer
 #date:				26.05.2018
-#version:			0.2
+#version:			1.0
 #usage:				python pyscript.py
 #notes:
 #known_issues:
@@ -16,124 +16,92 @@
 import time
 from datetime import date
-# For random generation of numbers import randint and shuffle to shuffle an array
+# For random generation of numbers import randint
-from random import randint, shuffle
+from random import randint
 # Importing library for multi-core processing
 import multiprocessing
 # Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
 import dmlib
 import dmtest
 # CODE
 # Main function of the algorithm.
 # The added (+) revision takes the data list, the number of clusters and the number of runs.
 # It stores one randomly picked data entry per cluster in module globals named
 # cpoint_<i>, then calls assignCluster followed by calcClusters once per run.
 # Always returns 0.
-def kmeansmk1(data):
+def kmeansmk1(data, clusters, runs):
-	# Using two clusters for testing
+	# Defining cluster points
-	clusters = 2
+	for i in range(0, clusters):
 		# NOTE(review): random.randint includes both endpoints, so randint(0, len(data))
 		# can return len(data) and make data[...] raise IndexError - confirm and fix
 		globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
 		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
 	# Get max value in the data array
 	highPoint = dmlib.findHighest(data)
 	# Each run assigns every data point to a cluster point, then recomputes the cluster points
 	for run in range(0, runs):
 		new_data = assignCluster(data, highPoint, clusters)
 		calcClusters(new_data, clusters)
 	return 0
 # Calculates the middle value (rounded mean) of each cluster and stores it in the
 # module global cpoint_<cluster>.
 # data: 2D list; data[0] holds the items, data[1] the cluster point assigned to each item
 # clusters: number of clusters
 # Always returns 0.
 def calcClusters(data, clusters):
 	items, assigned = data[0], data[1]
 	for cluster in range(clusters):
 		key = "cpoint_" + str(cluster)
 		current = globals()[key]
 		members = [int(value) for value, owner in zip(items, assigned) if owner == current]
 		# A cluster without members keeps its previous point; dividing by a
 		# member count of zero would raise ZeroDivisionError
 		if members:
 			globals()[key] = round(sum(members) / len(members))
 	return 0
 # Assigns every item of data to a cluster point and returns a 2D list
 # [data, data_assigned], where data_assigned[i] is the value chosen for data[i].
 # NOTE(review): this hunk interleaves removed (-) and added (+) lines. Some unmarked
 # lines (the "Defining cluster points" loop, the findHighest call and the
 # "assinged_cluster = clusternumber" line) reference names that only the removed
 # lines define - presumably they belong to the previous revision; confirm against the committed file.
 def assignCluster(data, highPoint, clusters):
 	# Create a new data array for working
 	new_data = []
 	new_data.append(data)
 	# Get the size of the data array
 	data_size = len(new_data[0])
 	# Defining cluster points
 	for i in range(0, clusters):
 		# NOTE(review): random.randint includes both endpoints, so data_size itself can be
 		# drawn and the index lookup in the print below can raise IndexError - confirm
 		globals()["cpoint_" + str(i)] = randint(0, data_size)
 		print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]]))
 	# Create new array for assigned clusters of each value
 	data_assigned = []
 	# Get max value in the data array
 	highPoint = findHighest(new_data[0])
 	# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, data_size):
+	for item in range(0, len(new_data[0])):
 		# Set the minimal cluster difference to the highest difference in the list to ease comparision
 		min_cluster = highPoint
 		# NOTE(review): the comparison below is strict, so when no difference is smaller than
 		# highPoint, assinged_cluster is not set for this item - confirm this cannot happen
 		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
 		for cluster in range(0, clusters):
-			clusternumber = globals()["cpoint_" + str(cluster)]
+			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0]):
-			if min_cluster > calcdiff(item, clusternumber, new_data[0]):
+				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0])
-				min_cluster = calcdiff(item, clusternumber, new_data[0])
+				assinged_cluster = globals()["cpoint_" + str(cluster)]
 				assinged_cluster = clusternumber
 		# Assign the minimal difference cluster to the data
 		data_assigned.append(assinged_cluster)
 	# Add the assigned values list to the new_data array
 	new_data.append(data_assigned)
 	# Print out the list of datapoints and assigned clusters
 	for item in range(0, len(new_data[0])):
 		# The assigned value is used as an index into the data list here
 		print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]]))
 	return new_data
 # Determine the highest int value in a sequence.
 # 0 always takes part in the comparison, so the result is never below 0 and an
 # empty sequence yields 0.
 def findHighest(data):
 	candidates = [0]
 	candidates.extend(int(entry) for entry in data)
 	return max(candidates)
 # Calculate the absolute difference between two data entries, given their indexes in data.
 # point1, point2: indexes into data
 # data: sequence whose entries int() can convert
 def calcdiff(point1, point2, data):
 	# abs() handles both orderings, which makes the manual branch and the
 	# betrag() call on an already non-negative value unnecessary
 	return abs(int(data[point2]) - int(data[point1]))
 # Get the absolute value of a number.
 # Negative input comes back negated and converted with int(); anything else is
 # returned as it was passed in.
 def betrag(number):
 	# -number stays in exact integer arithmetic; the earlier (-2 * number) / 2
 	# detour produced a float and rounded integers above 2**53
 	return int(-number) if number < 0 else number
 # Startup function for collecting necessary data.
 # Fixes the run parameters (2 clusters, 500 runs), calls kmeansmk1 on data and
 # measures the wall-clock time of that call. The elapsed seconds are computed
 # but neither printed (the print is commented out) nor returned.
 def startup(data):
 	# Using two clusters for testing
 	# clusters = int(input("How many clusters are known? "))
 	clusters = 2
 	# cores = input("How many cores should be used? ")
 	# path = input("Where is the data? ") or in this case data
 	# runs = int(input("How many runs are sufficient? "))
 	runs = 500
 	# For benchmarking starting the timer now
 	start_time = time.time()
 	# Firing up the engines!
-	kmeansmk1(data)
+	kmeansmk1(data, clusters, runs)
 	# kmeansmk1(clusters, cores, data)
 	# Stopping benchmark
 	seconds = time.time() - start_time
 	# print(str(seconds) + " seconds for execution")
 # Simple generator for test data: creates 100 PLZs and returns them as a shuffled list.
 # Indexes 0-20 get the prefix "09", indexes 21-49 the prefix "08", and the
 # remaining indexes 50-99 are generated without a prefix.
 def testgenerator():
 	entries = []
 	for index in range(100):
 		if index <= 20:
 			prefix = "09"
 		elif index < 50:
 			prefix = "08"
 		else:
 			prefix = ""
 		entries.append(generatePLZ(prefix))
 	shuffle(entries)
 	return entries
 # Builds a PLZ string: five random digits when start is empty, otherwise start
 # followed by three random digits.
 def generatePLZ(start):
 	digits_needed = 3 if start else 5
 	suffix = "".join(str(randint(0, 9)) for _ in range(digits_needed))
 	return start + suffix
 # Start the algorithm and generate test data
 # These statements sit at module level without a __main__ guard, so they also run on import
-data = testgenerator()
+data = dmtest.testgenerator()
 startup(data)