# Main function of the algorithm
def kmeansmk1(data):
    """Assign every data point to the nearest of two randomly picked centroids.

    Two entries of ``data`` are drawn at random (the same entry may be drawn
    twice) and used as cluster centres. Every entry is then assigned to the
    centre whose integer value is closest; on a tie the centre drawn first
    wins.

    Args:
        data: Sequence of values that ``int()`` can convert (ints or numeric
            strings).

    Returns:
        A two-element list ``[data, data_assigned]``. ``data`` is the input
        object itself and ``data_assigned[i]`` is the index (into ``data``)
        of the centre that ``data[i]`` was assigned to. For empty input the
        result is ``[data, []]``.

    Side effects:
        Prints the chosen centres and the assignment of every data point.
    """
    # Using two clusters for testing
    clusters = 2

    # Working array: slot 0 holds the input values, slot 1 the assignments
    new_data = [data]
    data_size = len(data)

    # Nothing to cluster: randint() would raise on an empty range
    if data_size == 0:
        new_data.append([])
        return new_data

    # Defining cluster points. randint() includes BOTH bounds, so the upper
    # bound has to be the last valid index, not the length of the data.
    cluster_points = []
    for i in range(clusters):
        point = randint(0, data_size - 1)
        cluster_points.append(point)
        print("Cluster " + str(i) + ": " + str(data[point]))

    # Convert the centre values once instead of once per comparison
    cluster_values = [int(data[point]) for point in cluster_points]

    # For each item find the centre with the minimal difference. Starting
    # from "no candidate yet" guarantees that every item gets a centre, even
    # when its difference equals the largest value in the data.
    data_assigned = []
    for value in data:
        number = int(value)
        best_point = None
        best_difference = None
        for point, cluster_value in zip(cluster_points, cluster_values):
            difference = betrag(cluster_value - number)
            if best_difference is None or difference < best_difference:
                best_difference = difference
                best_point = point
        data_assigned.append(best_point)

    # Add the assigned values list to the new_data array
    new_data.append(data_assigned)

    # Print out the list of datapoints and assigned clusters
    for value, point in zip(data, data_assigned):
        print("Datapoint: " + str(value) + " | Assigned cluster: " + str(data[point]))

    return new_data


# Get the absolute value of a number
def betrag(number):
    """Return the absolute value of ``number``.

    Non-negative input is returned unchanged. Negative input is negated and
    converted with ``int()``, so a negative float is truncated towards zero
    (``betrag(-2.5) == 2``). The negation is done on the number itself, not
    through float division, so arbitrarily large ints stay exact.
    """
    if number < 0:
        number = int(-number)
    return number