From 77f3a16c45d59ce8dcf574283e1b4f24746a6260 Mon Sep 17 00:00:00 2001
From: Conrad <grosserconrad@gmail.com>
Date: Wed, 30 May 2018 23:22:02 +0200
Subject: [PATCH] kmeans Update 1.0

- Moved some functions into separate libraries (dmlib, dmtest)
- Finished the algorithm, added calcClusters function
- Optimized code
---
 .../__pycache__/dmlib.cpython-36.pyc          | Bin 0 -> 665 bytes
 .../__pycache__/dmtest.cpython-36.pyc         | Bin 0 -> 741 bytes
 src/algorithms/dmlib.py                       |  22 ++++
 src/algorithms/dmtest.py                      |  28 ++++
 src/algorithms/kmeansMkI.py                   | 124 +++++++-----------
 5 files changed, 96 insertions(+), 78 deletions(-)
 create mode 100644 src/algorithms/__pycache__/dmlib.cpython-36.pyc
 create mode 100644 src/algorithms/__pycache__/dmtest.cpython-36.pyc
 create mode 100644 src/algorithms/dmlib.py
 create mode 100644 src/algorithms/dmtest.py

diff --git a/src/algorithms/__pycache__/dmlib.cpython-36.pyc b/src/algorithms/__pycache__/dmlib.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32785811e3169cec640337bc77d3f511213118a9
GIT binary patch
literal 665
zcmZ8e!H&}~5S_8pG<3C4Ry`~N3B)ZDLSpx}TA`Kbfm1IWkV-hDvD>vaaf{=kMM_TH
zpW&B$<<wt5h%%F+rNWYDJTK17d(Y>`$Ajl5k3Rhp06*YhQ4LqL>t_@cD7=FyD4`@}
zRFUdY7ApP#X}^$kg;zwI&+lm07Zf}+xIv1r!JEjvJgiTlm3FxQw#OOq*vJi1SDd6G
zNMn77XWF}bkw(*awItxp-+UG#mHRvll$p=9)3(sx!S!hD&sXD%Y^=ZPa=EIs^;u=C
zc3JI;EH4*JXZ&?lXR0dA?DgtfNQ%5HxCv{n6AFMsJP||T`gAUk5n7z@<*_Xd*>VH!
z)Ps@@jL4p+B(x=*1Voc9|Fq?%+uyt0E=9*3Qk=$iUCp%Xxb8zt9~|F2qFr+Zm=K8$
zyC=ZI+2hXNOFAfQ@%D_yBl$+7H)0H*U$nxD1{(2%kcNF9{i|F2GWtlbLIlOcvDQ?E
zp3Ch*2U%*nrzjndn)a*w)>P{%p!tt$IGP))-kZg>uKi=WTMsAzMmQqY#QPytlT>yF
eb|O0DpbNi~W1%6rtd?qB>bIT!tS@<x#D4(11cS~1

literal 0
HcmV?d00001

diff --git a/src/algorithms/__pycache__/dmtest.cpython-36.pyc b/src/algorithms/__pycache__/dmtest.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd09c6256b73df00ef1b69baa35b369dd8cd477c
GIT binary patch
literal 741
zcmaJ<!EO^V5S_7iH`~ypk$}R16RHr4L{$kDA%swX95|pV5{F1jI4rS~rrF)3V;8k(
za|%DhKky~Ka>_4oV(j+N6PD)r?TmN){OlK<PW$Eky)VB7z#q6Z_#65h=72$g!V)UX
zu@bpJP(sO{kV_S*nCD0(pCM~)c_n1RwWz4oytb}oPA{g@Qcrxr4|x+GcsAb{X0V3K
zq2=~UY{US*zF%X5N-mI|SX9vl*P?zLG7<H8ju$|Et|Xz7GHZF(z?o>o1`*DCBe1do
z&l_M_=?Nk4v{fFSGUVducsmT__{c#PxtImdw7WCYwWh-A;iuds#cHK%MP70y^Bq;#
z;vG?O<#6ugs{HW_sKd3t&j+u^1N~i>%T=XoJFe#3WNc_MF3Q=G=JvEQV^vvg?BVLl
zbv(J*Xh|`5F!vY$BkW;{ZN|1pvCEwHI6v{?{qRB(j^N#tiRJTcP~k$*D<#&*KC*M{
znfGy@JwYq1WM|_IJM#e6c>ZVv^|OZA<w01R{AcaQ5j5zp_1Zsb@s``*q*t-s@enbv
zf3nmyrOcf)mdHPyOuE?Ef-LR&Je~!!uA$2r^8erEFg*sq-EjVg!n?4Kp?Z?VF7Z!k
jS%sG2DuO4We5;|q2l8EA`lec{i&DS!8jP3MlWF%KTyUBs

literal 0
HcmV?d00001

diff --git a/src/algorithms/dmlib.py b/src/algorithms/dmlib.py
new file mode 100644
index 0000000..0d0fc71
--- /dev/null
+++ b/src/algorithms/dmlib.py
@@ -0,0 +1,22 @@
+# Calculate the absolute difference between two point values (each converted to int); the data argument is currently unused
+def calcdiff(point1, point2, data):
+	if int(point2) > int(point1):
+		difference = int(point2) - int(point1)
+	else:
+		difference = int(point1) - int(point2)
+	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
+	return betrag(difference)
+
+# Return the absolute value of a number (negative inputs are converted to int, other inputs are returned unchanged)
+def betrag(number):
+	if number < 0:
+		number = int((-2 * number) / 2)
+	return number
+
+# Determine the highest int value in an array and return it as an int (0 if no entry is greater than 0)
+def findHighest(data):
+	maximum = 0
+	for i in range(0, len(data)):
+		if int(data[i]) > maximum:
+			maximum = int(data[i])
+	return maximum
\ No newline at end of file
diff --git a/src/algorithms/dmtest.py b/src/algorithms/dmtest.py
new file mode 100644
index 0000000..686d126
--- /dev/null
+++ b/src/algorithms/dmtest.py
@@ -0,0 +1,28 @@
+# For random generation of numbers import randint, and shuffle to shuffle an array
+from random import randint, shuffle
+
+# Simple generator for test data (100 PLZs: 41 starting with "05", 39 starting with "50", 20 fully random), returns a shuffled 1D array of PLZ strings
+def testgenerator():
+	dataArray = []
+	for i in range(0,100):
+		if i <= 40:
+			plz = generatePLZ("05")
+		elif i > 40 and i < 80:
+			plz = generatePLZ("50")
+		else:
+			plz = generatePLZ("")
+		dataArray.append(plz)
+	shuffle(dataArray)
+	return dataArray
+
+# Generate a PLZ string: five random digits if start is empty, otherwise start followed by three random digits
+def generatePLZ(start):
+	if len(start) == 0:
+		plz = ""
+		for j in range(1,6):
+			plz = plz + str(randint(0,9))
+	else:
+		plz = start
+		for j in range(1,4):
+			plz = plz + str(randint(0,9))
+	return plz
diff --git a/src/algorithms/kmeansMkI.py b/src/algorithms/kmeansMkI.py
index 7b1b498..a7936b1 100644
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -3,7 +3,7 @@
 #description:		Our personal Python K-Means++ implementation
 #author:			Tillmann Brendel, Conrad Großer
 #date:				26.05.2018
-#version:			0.2
+#version:			1.0
 #usage:				python pyscript.py
 #notes:
 #known_issues:
@@ -16,124 +16,92 @@
 import time
 from datetime import date
 
-# For random generation of numbers import randint and shuffle to shuffle an array
-from random import randint, shuffle
+# For random generation of numbers import randint
+from random import randint
 
 # Importing libary for multi core processing
 import multiprocessing
 
+# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
+import dmlib
+import dmtest
+
 # CODE
 # Main function of the algorithm
-def kmeansmk1(data):
-	# Using two clusters for testing
-	clusters = 2
+def kmeansmk1(data, clusters, runs):
+	# Defining cluster points
+	for i in range(0, clusters):
+		globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
+		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
 
+	# Get max value in the data array
+	highPoint = dmlib.findHighest(data)
+
+	for run in range(0, runs):
+		new_data = assignCluster(data, highPoint, clusters)
+		calcClusters(new_data, clusters)
+
+	return 0
+
+# Calculates the rounded mean value of each cluster and stores it in the global cpoint_<n>, takes 2D array [items, assigned_clusters]
+def calcClusters(data, clusters):
+	for cluster in range(0, clusters):
+		clustersum = 0
+		count = 0
+		for item in range(0, len(data[0])):
+			if data[1][item] == globals()["cpoint_" + str(cluster)]:
+				clustersum = clustersum + int(data[0][item])
+				count = count + 1
+		globals()["cpoint_" + str(cluster)] = round(clustersum / count)
+	return 0
+
+def assignCluster(data, highPoint, clusters):
 	# Create a new data array for working
 	new_data = []
 	new_data.append(data)
 
-	# Get the size of the data array
-	data_size = len(new_data[0])
-
-	# Defining cluster points
-	for i in range(0, clusters):
-		globals()["cpoint_" + str(i)] = randint(0, data_size)
-		print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]]))
-
 	# Create new array for assigned clusters of each value
 	data_assigned = []
 
-	# Get max value in the data array
-	highPoint = findHighest(new_data[0])
-
 	# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, data_size):
+	for item in range(0, len(new_data[0])):
 		# Set the minimal cluster difference to the highest difference in the list to ease comparision
 		min_cluster = highPoint
 
 		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
 		for cluster in range(0, clusters):
-			clusternumber = globals()["cpoint_" + str(cluster)]
-			if min_cluster > calcdiff(item, clusternumber, new_data[0]):
-				min_cluster = calcdiff(item, clusternumber, new_data[0])
-				assinged_cluster = clusternumber
+			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0]):
+				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0])
+				assinged_cluster = globals()["cpoint_" + str(cluster)]
 		# Assign the minimal difference cluster to the data
 		data_assigned.append(assinged_cluster)
 	# Add the assigned values list to the new_data array
 	new_data.append(data_assigned)
 
-	# Print out the list of datapoints and assigned clusters
-	for item in range(0, len(new_data[0])):
-		print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]]))
-
 	return new_data
 
-# Determine the highest int value in an array
-def findHighest(data):
-	maximum = 0
-	for i in range(0, len(data)):
-		if int(data[i]) > maximum:
-			maximum = int(data[i])
-	return maximum
-
-# Calculate the difference between two points giving the indexes of these data entries
-def calcdiff(point1, point2, data):
-	if int(data[point2]) > int(data[point1]):
-		difference = int(data[point2]) - int(data[point1])
-	else:
-		difference = int(data[point1]) - int(data[point2])
-	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
-	return betrag(difference)
-
-# Get the absolute value of a number
-def betrag(number):
-	if number < 0:
-		number = int((-2 * number) / 2)
-	return number
-
 # Startup function for collecting necesarry data
 def startup(data):
+	# Using two clusters for testing
 	# clusters = int(input("How many clusters are known? "))
+	clusters = 2
 	# cores = input("How many cores should be used? ")
 	# path = input("Where is the data? ") or in this case data
 
+	# runs = int(input("How many runs are sufficient? "))
+	runs = 500
+	
 	# For benchmarking starting the timer now
 	start_time = time.time()
 
 	# Firing up the engines!
-	kmeansmk1(data)
-	# kmeansmk1(clusters, cores, data)
+	kmeansmk1(data, clusters, runs)
 
 	# Stopping benchmark
 	seconds = time.time() - start_time
 	# print(str(seconds) + " seconds for execution")
 
-# Simple generator for test data
-def testgenerator():
-	dataArray = []
-	for i in range(0,100):
-		if i <= 20:
-			plz = generatePLZ("09")
-		elif i > 20 and i < 50:
-			plz = generatePLZ("08")
-		else:
-			plz = generatePLZ("")
-		dataArray.append(plz)
-	shuffle(dataArray)
-	return dataArray
-
-# Generates a PLZ from a certain start point
-def generatePLZ(start):
-	if len(start) == 0:
-		plz = ""
-		for j in range(1,6):
-			plz = plz + str(randint(0,9))
-	else:
-		plz = start
-		for j in range(1,4):
-			plz = plz + str(randint(0,9))
-	return plz
-
 # Start the algorithm and generate test data
-data = testgenerator()
+data = dmtest.testgenerator()
+
 startup(data)
\ No newline at end of file