From 77f3a16c45d59ce8dcf574283e1b4f24746a6260 Mon Sep 17 00:00:00 2001 From: Conrad Date: Wed, 30 May 2018 23:22:02 +0200 Subject: [PATCH] kmeans Update 1.0 - Exported some functions in seperate libaries - Finished the algorithm, added calcCusters function - Optimized code --- .../__pycache__/dmlib.cpython-36.pyc | Bin 0 -> 665 bytes .../__pycache__/dmtest.cpython-36.pyc | Bin 0 -> 741 bytes src/algorithms/dmlib.py | 22 ++++ src/algorithms/dmtest.py | 28 ++++ src/algorithms/kmeansMkI.py | 124 +++++++----------- 5 files changed, 96 insertions(+), 78 deletions(-) create mode 100644 src/algorithms/__pycache__/dmlib.cpython-36.pyc create mode 100644 src/algorithms/__pycache__/dmtest.cpython-36.pyc create mode 100644 src/algorithms/dmlib.py create mode 100644 src/algorithms/dmtest.py diff --git a/src/algorithms/__pycache__/dmlib.cpython-36.pyc b/src/algorithms/__pycache__/dmlib.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32785811e3169cec640337bc77d3f511213118a9 GIT binary patch literal 665 zcmZ8e!H&}~5S_8pG<3C4Ry`~N3B)ZDLSpx}TA`Kbfm1IWkV-hDvD>vaaf{=kMM_TH zpW&B$<`$Ajl5k3Rhp06*YhQ4LqL>t_@cD7=FyD4`@} zRFUdY7ApP#X}^$kg;zwI&+lm07Zf}+xIv1r!JEjvJgiTlm3FxQw#OOq*vJi1SDd6G zNMn77XWF}bkw(*awItxp-+UG#mHRvll$p=9)3(sx!S!hD&sXD%Y^=ZPa=EIs^;u=C zc3JI;EH4*JXZ&?lXR0dA?DgtfNQ%5HxCv{n6AFMsJP||T`gAUk5n7z@<*_Xd*>VH! z)Ps@@jL4p+B(x=*1Voc9|Fq?%+uyt0E=9*3Qk=$iUCp%Xxb8zt9~|F2qFr+Zm=K8$ zyC=ZI+2hXNOFAfQ@%D_yBl$+7H)0H*U$nxD1{(2%kcNF9{i|F2GWtlbLIlOcvDQ?E zp3Ch*2U%*nrzjndn)a*w)>P{%p!tt$IGP))-kZg>uKi=WTMsAzMmQqY#QPytlT>yF eb|O0DpbNi~W1%6rtd?qB>bIT!tS@_4oV(j+N6PD)r?TmN){OlKo1`*DCBe1do z&l_M_=?Nk4v{fFSGUVducsmT__{c#PxtImdw7WCYwWh-A;iuds#cHK%MP70y^Bq;# z;vG?O<#6ugs{HW_sKd3t&j+u^1N~i>%T=XoJFe#3WNc_MF3Q=G=JvEQV^vvg?BVLl zbv(J*Xh|`5F!vY$BkW;{ZN|1pvCEwHI6v{?{qRB(j^N#tiRJTcP~k$*D<#&*KC*M{ znfGy@JwYq1WM|_IJM#e6c>ZVv^|OZA int(point1): + difference = int(point2) - int(point1) + else: + difference = int(point1) - int(point2) + # print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) + return betrag(difference) + +# Get the absolute value of a number and returns it as int +def betrag(number): + if number < 0: + number = int((-2 * number) / 2) + return number + +# Determine the highest int value in an array and returns is as an int +def findHighest(data): + maximum = 0 + for i in range(0, len(data)): + if int(data[i]) > maximum: + maximum = int(data[i]) + return maximum \ No newline at end of file diff --git a/src/algorithms/dmtest.py b/src/algorithms/dmtest.py new file mode 100644 index 0000000..686d126 --- /dev/null +++ b/src/algorithms/dmtest.py @@ -0,0 +1,28 @@ +# For random generation of numbers import randint +from random import randint, shuffle + +# Simple generator for test data (100 plzs, 20-30-50 biased), returns 1D array of plzs +def testgenerator(): + dataArray = [] + for i in range(0,100): + if i <= 40: + plz = generatePLZ("05") + elif i > 40 and i < 80: + plz = generatePLZ("50") + else: + plz = generatePLZ("") + dataArray.append(plz) + shuffle(dataArray) + return dataArray + +# Generates a PLZ from a certain start point +def generatePLZ(start): + if len(start) == 0: + plz = "" + for j in range(1,6): + plz = plz + str(randint(0,9)) + else: + plz = start + for j in range(1,4): + plz = plz + str(randint(0,9)) + return plz diff --git a/src/algorithms/kmeansMkI.py b/src/algorithms/kmeansMkI.py index 7b1b498..a7936b1 100644 --- a/src/algorithms/kmeansMkI.py +++ b/src/algorithms/kmeansMkI.py @@ -3,7 +3,7 @@ #description: Our personal Python K-Means++ implementation #author: Tillmann Brendel, Conrad Großer #date: 26.05.2018 -#version: 0.2 +#version: 1.0 #usage: python pyscript.py #notes: #known_issues: @@ -16,124 +16,92 @@ import time from datetime import date -# For random generation of numbers import randint and shuffle to shuffle an array -from random import randint, shuffle +# For random generation of numbers import randint +from random import randint # Importing libary for multi core processing import multiprocessing +# Importing own libaries Datamining Libary and Datamining Test +import dmlib +import dmtest + # CODE # Main function of the algorithm -def kmeansmk1(data): - # Using two clusters for testing - clusters = 2 +def kmeansmk1(data, clusters, runs): + # Defining cluster points + for i in range(0, clusters): + globals()["cpoint_" + str(i)] = data[randint(0, len(data))] + print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)])) + # Get max value in the data array + highPoint = dmlib.findHighest(data) + + for run in range(0, runs): + new_data = assignCluster(data, highPoint, clusters) + calcClusters(new_data, clusters) + + return 0 + +# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) +def calcClusters(data, clusters): + for cluster in range(0, clusters): + clustersum = 0 + count = 0 + for item in range(0, len(data[0])): + if data[1][item] == globals()["cpoint_" + str(cluster)]: + clustersum = clustersum + int(data[0][item]) + count = count + 1 + globals()["cpoint_" + str(cluster)] = round(clustersum / count) + return 0 + +def assignCluster(data, highPoint, clusters): # Create a new data array for working new_data = [] new_data.append(data) - # Get the size of the data array - data_size = len(new_data[0]) - - # Defining cluster points - for i in range(0, clusters): - globals()["cpoint_" + str(i)] = randint(0, data_size) - print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]])) - # Create new array for assigned clusters of each value data_assigned = [] - # Get max value in the data array - highPoint = findHighest(new_data[0]) - # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index]) - for item in range(0, data_size): + for item in range(0, len(new_data[0])): # Set the minimal cluster difference to the highest difference in the list to ease comparision min_cluster = highPoint # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference for cluster in range(0, clusters): - clusternumber = globals()["cpoint_" + str(cluster)] - if min_cluster > calcdiff(item, clusternumber, new_data[0]): - min_cluster = calcdiff(item, clusternumber, new_data[0]) - assinged_cluster = clusternumber + if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0]): + min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0]) + assinged_cluster = globals()["cpoint_" + str(cluster)] # Assign the minimal difference cluster to the data data_assigned.append(assinged_cluster) # Add the assigned values list to the new_data array new_data.append(data_assigned) - # Print out the list of datapoints and assigned clusters - for item in range(0, len(new_data[0])): - print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]])) - return new_data -# Determine the highest int value in an array -def findHighest(data): - maximum = 0 - for i in range(0, len(data)): - if int(data[i]) > maximum: - maximum = int(data[i]) - return maximum - -# Calculate the difference between two points giving the indexes of these data entries -def calcdiff(point1, point2, data): - if int(data[point2]) > int(data[point1]): - difference = int(data[point2]) - int(data[point1]) - else: - difference = int(data[point1]) - int(data[point2]) - # print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) - return betrag(difference) - -# Get the absolute value of a number -def betrag(number): - if number < 0: - number = int((-2 * number) / 2) - return number - # Startup function for collecting necesarry data def startup(data): + # Using two clusters for testing # clusters = int(input("How many clusters are known? ")) + clusters = 2 # cores = input("How many cores should be used? ") # path = input("Where is the data? ") or in this case data + # runs = int(input("How many runs are sufficient? ")) + runs = 500 + # For benchmarking starting the timer now start_time = time.time() # Firing up the engines! - kmeansmk1(data) - # kmeansmk1(clusters, cores, data) + kmeansmk1(data, clusters, runs) # Stopping benchmark seconds = time.time() - start_time # print(str(seconds) + " seconds for execution") -# Simple generator for test data -def testgenerator(): - dataArray = [] - for i in range(0,100): - if i <= 20: - plz = generatePLZ("09") - elif i > 20 and i < 50: - plz = generatePLZ("08") - else: - plz = generatePLZ("") - dataArray.append(plz) - shuffle(dataArray) - return dataArray - -# Generates a PLZ from a certain start point -def generatePLZ(start): - if len(start) == 0: - plz = "" - for j in range(1,6): - plz = plz + str(randint(0,9)) - else: - plz = start - for j in range(1,4): - plz = plz + str(randint(0,9)) - return plz - # Start the algorithm and generate test data -data = testgenerator() +data = dmtest.testgenerator() + startup(data) \ No newline at end of file