# ---- src/algorithms/dmlib.py ----
# Imports for dmlib
import math


def calcdiff(point1, point2):
    """Return the absolute difference between two int-convertible values.

    Args:
        point1: First value; anything ``int()`` accepts (int or digit string).
        point2: Second value; same requirements as ``point1``.

    Returns:
        int: ``|int(point1) - int(point2)|``.

    Raises:
        ValueError: If an argument cannot be converted with ``int()``.
    """
    # NOTE(review): the patch hunk hides the middle of this function. The
    # visible parts (an ``int(point2) > int(point1)`` test and the
    # ``int(point1) - int(point2)`` branch) are assumed to be mirrored by the
    # opposite subtraction, i.e. an absolute difference -- confirm.
    return abs(int(point1) - int(point2))


def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two points in 2D space.

    Args:
        point1: Sequence whose first two items are int-convertible (x, y).
        point2: Sequence whose first two items are int-convertible (x, y).

    Returns:
        float: The distance between the two points; never negative.

    Raises:
        ValueError: If a coordinate cannot be converted with ``int()``.
    """
    delta_x = int(point2[0]) - int(point1[0])
    delta_y = int(point2[1]) - int(point1[1])
    # hypot() is already non-negative, so no abs() is needed on the result.
    return math.hypot(delta_x, delta_y)


# ---- src/algorithms/dmtest.py ----
# For random generation of numbers
from random import randint, shuffle
# Test-data generators of dmtest.py.
# ``randint`` and ``shuffle`` come from ``from random import randint, shuffle``
# at the top of that file.


def numGenLight(entries, shuffle, num_lenght):
    """Generate digit strings with a biased leading digit.

    The first 40 % of the entries start with 2, the next 20 % with 9, the
    next 30 % with 4 and the remaining 10 % with a random digit.

    Args:
        entries: Number of strings to generate (anything ``int()`` accepts).
        shuffle: If truthy, the result is shuffled before it is returned.
        num_lenght: Number of digits of every generated string.

    Returns:
        list[str]: The generated digit strings.
    """
    # The ``shuffle`` flag shadows random.shuffle inside this function, so
    # calling ``shuffle(...)`` here would call the flag itself. Import the
    # real function under another name.
    from random import shuffle as shuffle_in_place

    count = int(entries)
    # Upper bounds (exclusive) of the three fixed-digit sections.
    end_of_twos = round(count * 0.4)
    end_of_nines = round(count * 0.6)
    end_of_fours = round(count * 0.9)

    dataArray = []
    for i in range(count):
        if i < end_of_twos:
            leading_digit = 2
        elif i < end_of_nines:
            leading_digit = 9
        elif i < end_of_fours:
            leading_digit = 4
        else:
            leading_digit = randint(0, 9)
        dataArray.append(generateNumber(num_lenght, leading_digit))

    if shuffle:
        shuffle_in_place(dataArray)
    return dataArray


def generateNumber(numberLenght, startingNumber):
    """Build one random digit string.

    Args:
        numberLenght: One more than the number of random digits appended.
        startingNumber: Value whose ``str()`` form becomes the prefix.

    Returns:
        str: ``str(startingNumber)`` followed by ``numberLenght - 1`` random
        digits (none at all when ``numberLenght`` is 1 or less).
    """
    random_tail = "".join(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return str(startingNumber) + random_tail


def writeFile(content, nameChunkStart, namePartStart):
    """Write every item of ``content`` on its own line of a numbered file.

    The file is ``testdata/file<N>.txt`` with
    ``N = int(nameChunkStart) + int(namePartStart)``. The ``testdata`` folder
    has to exist already.

    Args:
        content: Sequence of strings, one per output line.
        nameChunkStart: First summand of the file number.
        namePartStart: Second summand of the file number.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # ``with`` closes (and thereby flushes) the file even if a write fails.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as out_file:
        out_file.writelines(line + "\n" for line in content)


def numGen(entries, cluster, int_lenght, suffle_value):
    """Generate digit strings grouped around random two-digit prefixes.

    Roughly one third of the entries is noise (random leading digit 1-9), the
    rest starts with one of ``cluster`` random prefixes between 10 and 99.

    Args:
        entries: Number of strings to generate.
        cluster: Number of prefixes to draw; with 0 every entry is noise.
        int_lenght: Number of digits of every generated string.
        suffle_value: If truthy, the result is shuffled before it is returned.

    Returns:
        list[str]: The generated digit strings.
    """
    clusterArray = [randint(10, 99) for _ in range(cluster)]

    dataArray = []
    for _ in range(entries):
        # Without any prefix there is nothing to pick from, so fall back to
        # noise instead of calling randint() on an empty range.
        if not clusterArray or randint(0, 2) == 2:
            dataArray.append(generateNumber(int_lenght, randint(1, 9)))
        else:
            prefix = clusterArray[randint(0, len(clusterArray) - 1)]
            # The prefix already has two digits, hence one digit less here.
            dataArray.append(generateNumber(int_lenght - 1, prefix))

    if suffle_value:
        shuffle(dataArray)

    return dataArray


# ---- src/algorithms/kmeansMkI.py ----
#!/usr/bin/env python
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.3
# usage: python pyscript.py
# notes:
# dependencies: matplotlib
# known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
# python_version: 3.x
# ==============================================================================

# IMPORTS
# (The patch leaves this section untouched and only shows its last lines:)
#   import matplotlib.pyplot as plt
#   import dmlib
#   import dmtest
# CODE (kmeansMkI.py, 1D variant)
# Names used from this module's import section: time, randint, plt, dmlib,
# dmtest.


def _cpoint_key(index):
    """Return the name of the module-level variable holding centre ``index``."""
    return "cpoint_" + str(index)


# Main function of the algorithm
def kmeansmk1(data, clusters):
    """Cluster 1D ``data`` into ``clusters`` groups, print and plot the result.

    The current centres are kept in the module-level variables ``cpoint_0``,
    ``cpoint_1``, ... which ``assignCluster`` and ``calcClusters`` read and
    update.

    Args:
        data: Sequence of int-convertible values (e.g. digit strings).
        clusters: Number of cluster centres to look for.

    Returns:
        int: Always 0.
    """
    # Defining cluster points. randint() includes its upper bound, so the
    # last valid index is len(data) - 1.
    for i in range(clusters):
        globals()[_cpoint_key(i)] = data[randint(0, len(data) - 1)]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()[_cpoint_key(i)]))

    # Numeric maximum: max() on the raw digit strings would compare them
    # lexicographically and hand a str to the int comparisons downstream.
    highPoint = max(int(value) for value in data)

    # runs is just for benchmarking
    done = False
    runs = 0

    # Reassign the data and move the centres until calcClusters reports that
    # no centre moved any more.
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()[_cpoint_key(i)]) + " after " + str(runs) + " runs")

    # Artificial y values (the item index) to show the 1D data in a 2D plot
    anew = list(range(len(data)))

    # Drawing found clusters as lines
    for i in range(clusters):
        plt.axvline(x=int(globals()[_cpoint_key(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0


def calcClusters(data, clusters):
    """Move every centre to the rounded mean of the values assigned to it.

    Args:
        data: Two-row list as returned by ``assignCluster``: ``data[0]`` holds
            the values, ``data[1]`` the centre each value was assigned to.
        clusters: Number of cluster centres.

    Returns:
        bool: True when no centre moved, i.e. the algorithm is done.
    """
    values, assigned_centres = data[0], data[1]
    converged = True
    for cluster in range(clusters):
        key = _cpoint_key(cluster)
        prev_cluster = globals()[key]

        # Items are matched by centre value, so clusters sharing the very
        # same centre also share their items.
        members = [int(value)
                   for value, centre in zip(values, assigned_centres)
                   if centre == prev_cluster]

        # A cluster without items keeps its centre instead of dividing by 0.
        if members:
            globals()[key] = round(sum(members) / len(members))

        # One moved centre is enough to need another round. Compare as ints
        # because the initial centres are raw data items (digit strings).
        if int(prev_cluster) != int(globals()[key]):
            converged = False

    return converged


def assignCluster(data, highPoint, clusters):
    """Assign every value to its nearest cluster centre.

    Args:
        data: Sequence of int-convertible values.
        highPoint: Unused; kept so that existing callers keep working.
        clusters: Number of cluster centres.

    Returns:
        list: ``[data, data_assigned]`` where ``data_assigned[i]`` is the
        centre nearest to ``data[i]`` (the first one on ties).
    """
    centres = [globals()[_cpoint_key(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for item in data:
        distances = [dmlib.calcdiff(item, centre) for centre in centres]
        # index(min(...)) picks the first of several equally near centres
        data_assigned.append(centres[distances.index(min(distances))])

    return [data, data_assigned]


# Startup function for collecting necessary data
def startup(data):
    """Ask for the number of clusters, run the algorithm and print the runtime.

    Args:
        data: Sequence of int-convertible values handed to ``kmeansmk1``.
    """
    clusters = int(input("How many clusters are known? "))

    # For benchmarking starting the timer now
    start_time = time.time()

    # Firing up the engines!
    kmeansmk1(data, clusters)

    # Stopping benchmark
    seconds = time.time() - start_time
    print(str(seconds) + " seconds for execution")


# Start the algorithm and generate test data. Guarded so that importing this
# module does not prompt for input or open a plot window.
if __name__ == "__main__":
    data = dmtest.numGen(10000, 10, 5, True)
    startup(data)


# ---- src/algorithms/kmeansMkI_2d.py ----
#!/usr/bin/env python
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.6
# usage: python pyscript.py
# notes:
# known_issues:
# python_version: 3.x
# ==============================================================================

# IMPORTS
# (The patch leaves this section untouched and only shows its last lines:)
#   import matplotlib.pyplot as plt
#   import dmlib
#   import dmtest
# CODE (kmeansMkI_2d.py, 2D variant)
# Names used from this module's import section: time, randint, plt, dmlib,
# dmtest.


# Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
    """Cluster 2D points into ``clusters`` groups, print and plot the result.

    The current centres are kept as ``[x, y]`` lists in the module-level
    variables ``cpoint_0``, ``cpoint_1``, ... which ``assignCluster`` and
    ``calcClusters`` read and update.

    Args:
        xdata: Sequence of int-convertible x coordinates.
        ydata: Sequence of int-convertible y coordinates, parallel to xdata.
        clusters: Number of cluster centres to look for.
    """
    # Defining cluster points. randint() includes its upper bound, so the
    # last valid index is len(...) - 1. x and y are drawn independently.
    for i in range(clusters):
        globals()["cpoint_" + str(i)] = [xdata[randint(0, len(xdata) - 1)], ydata[randint(0, len(ydata) - 1)]]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Numeric maxima: max() on the raw digit strings would be lexicographic.
    highpointx = max(int(value) for value in xdata)
    highpointy = max(int(value) for value in ydata)

    done = False
    runs = 0

    # Reassign the points and move the centres until calcClusters reports
    # that no centre moved any more.
    while not done:
        runs += 1
        # assigned_points holds one cluster number per point
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    for i in range(clusters):
        centre = globals()["cpoint_" + str(i)]
        # A centre that never received a point still holds raw data items,
        # so convert before plotting.
        plt.plot(int(centre[0]), int(centre[1]), 'ro')

    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()


def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every centre to the rounded mean of the points assigned to it.

    Args:
        xdata: Sequence of int-convertible x coordinates.
        ydata: Sequence of int-convertible y coordinates, parallel to xdata.
        assigned_points: Cluster number of every point, parallel to xdata.
        clusters: Number of cluster centres.

    Returns:
        bool: True when no centre moved, i.e. the algorithm is done.
    """
    # Set once before the loop so that every cluster can veto convergence,
    # not only the last one.
    cpointunchanged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        old_centre = globals()[key]

        clustersumx = 0
        clustersumy = 0
        count = 0
        for x_value, y_value, owner in zip(xdata, ydata, assigned_points):
            if owner == cluster:
                clustersumx += int(x_value)
                clustersumy += int(y_value)
                count += 1

        # A cluster without points keeps its centre instead of dividing by 0.
        if count:
            globals()[key] = [round(clustersumx / count), round(clustersumy / count)]

        # Compare as ints because the initial centres are raw data items.
        if [int(v) for v in old_centre] != [int(v) for v in globals()[key]]:
            cpointunchanged = False
    return cpointunchanged


def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every point to the number of its nearest cluster centre.

    Args:
        xdata: Sequence of int-convertible x coordinates.
        ydata: Sequence of int-convertible y coordinates, parallel to xdata.
        clusters: Number of cluster centres.
        highpointx: Unused; kept so that existing callers keep working.
        highpointy: Unused; kept so that existing callers keep working.

    Returns:
        list[int]: Cluster number per point (the first one on ties, 0 when
        there are no clusters at all).
    """
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for x_value, y_value in zip(xdata, ydata):
        distances = [dmlib.calcdiff2d(centre, [x_value, y_value]) for centre in centres]
        # The nearest centre is picked for every point on its own, so nothing
        # carries over from the previous point.
        nearest = min(range(len(centres)), key=distances.__getitem__, default=0)
        data_assigned.append(nearest)

    return data_assigned


# Startup function for collecting necessary data
def startup(xdata, ydata):
    """Ask for the number of clusters, run the algorithm and print the runtime.

    Args:
        xdata: Sequence of int-convertible x coordinates.
        ydata: Sequence of int-convertible y coordinates, parallel to xdata.
    """
    # NOTE(review): the patch hunk hides the lines between the first comment
    # of this function and the runtime computation. The prompt, the timer
    # start and the kmeansmk1 call are assumed to mirror startup() in
    # kmeansMkI.py -- confirm against the real file.
    clusters = int(input("How many clusters are known? "))

    start_time = time.time()

    kmeansmk1(xdata, ydata, clusters)

    seconds = time.time() - start_time
    print(str(seconds) + " seconds for execution")


# Start the algorithm and generate test data. Guarded so that importing this
# module does not prompt for input or open a plot window.
if __name__ == "__main__":
    xdata = dmtest.numGenLight(10000, False, 5)
    ydata = dmtest.numGenLight(10000, False, 2)
    # NOTE(review): this call lies beyond the visible hunk; assumed to follow
    # the data generation as in kmeansMkI.py -- confirm.
    startup(xdata, ydata)