From e2ad63f90f5e58b039de820c7b75c8d22297a2c6 Mon Sep 17 00:00:00 2001 From: Conrad Date: Thu, 25 Apr 2019 12:43:37 +0200 Subject: [PATCH] Code Rework --- src/algorithms/dmlib.py | 53 ++++----- src/algorithms/dmtest.py | 48 ++++---- src/algorithms/kmeansMkI.py | 192 +++++++++++++++--------------- src/data_generators/randomi.py | 102 +++++++++------- src/data_generators/randomi2.1.py | 137 +++++++++++---------- 5 files changed, 273 insertions(+), 259 deletions(-) diff --git a/src/algorithms/dmlib.py b/src/algorithms/dmlib.py index 8d7203e..b4d3876 100644 --- a/src/algorithms/dmlib.py +++ b/src/algorithms/dmlib.py @@ -1,40 +1,27 @@ # Calculate the difference between two points giving the indexes of these data entries def calcdiff(point1, point2): - if int(point2) > int(point1): - difference = int(point2) - int(point1) - else: - difference = int(point1) - int(point2) - # print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) - return betrag(difference) + if int(point2) > int(point1): + difference = int(point2) - int(point1) + else: + difference = int(point1) - int(point2) + return abs(difference) -# Get the absolute value of a number and returns it as int -def betrag(number): - if number < 0: - number = int((-2 * number) / 2) - return number - -# Determine the highest int value in an array and returns is as an int -def findHighest(data): - maximum = 0 - for i in range(0, len(data)): - if int(data[i]) > maximum: - maximum = int(data[i]) - return maximum def pp_calcdiff(data, clusterpoint): - max_diff = 0 - new_cluster = 0 - for item in range(0,len(data)): - if calcdiff(data[item], clusterpoint) > max_diff: - max_diff = calcdiff(data[item], clusterpoint) - new_cluster = data[item] - return new_cluster + max_diff = 0 + new_cluster = 0 + for item in range(0, len(data)): + if calcdiff(data[item], clusterpoint) > max_diff: + max_diff = calcdiff(data[item], clusterpoint) + new_cluster = data[item] + return new_cluster + def pp_calcdiff_2(data, clusterpoint, clusterpoint_2): - max_diff = 0 - new_cluster = 0 - for item in range(0,len(data)): - if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff: - max_diff = calcdiff(data[item], clusterpoint) - new_cluster = data[item] - return new_cluster \ No newline at end of file + max_diff = 0 + new_cluster = 0 + for item in range(0, len(data)): + if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff: + max_diff = calcdiff(data[item], clusterpoint) + new_cluster = data[item] + return new_cluster diff --git a/src/algorithms/dmtest.py b/src/algorithms/dmtest.py index d13dc48..ff98075 100644 --- a/src/algorithms/dmtest.py +++ b/src/algorithms/dmtest.py @@ -1,34 +1,38 @@ # For random generation of numbers import randint from random import randint, shuffle + # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs def plzGen(entries): - dataArray = [] - plz_lenght = 5 - for i in range(0, int(entries)): - if i < round(entries * 0.4): - plz = generateNumber(plz_lenght, 2) - elif i >= round(entries * 0.4) and i < round(entries * 0.6): - plz = generateNumber(plz_lenght, 9) - elif i >= round(entries * 0.6) and i < round(entries * 0.9): - plz = generateNumber(plz_lenght, 4) - else: - plz = generateNumber(plz_lenght, randint(0,9)) - dataArray.append(plz) - shuffle(dataArray) - return dataArray + dataArray = [] + plz_lenght = 5 + for i in range(0, int(entries)): + if i < round(entries * 0.4): + plz = generateNumber(plz_lenght, 2) + elif i >= round(entries * 0.4) and i < round(entries * 0.6): + plz = generateNumber(plz_lenght, 9) + elif i >= round(entries * 0.6) and i < round(entries * 0.9): + plz = generateNumber(plz_lenght, 4) + else: + plz = generateNumber(plz_lenght, randint(0, 9)) + dataArray.append(plz) + shuffle(dataArray) + return dataArray + # Function for generating the content of one single row randomly def generateNumber(numberLenght, startingNumber): - number = str(startingNumber) - for length in range(0, numberLenght - 1): - number = number + str(randint(0,9)) - return number + number = str(startingNumber) + for length in range(0, numberLenght - 1): + number = number + str(randint(0, 9)) + return number + # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming) # /testdata/ folder has to be created at this point def writeFile(content, nameChunkStart, namePartStart): - filenumber = int(nameChunkStart) + int(namePartStart) - file = open("testdata/file" + str(filenumber) + ".txt", "w") - for w in range(0, len(content)): - file.write(content[w] + "\n") \ No newline at end of file + filenumber = int(nameChunkStart) + int(namePartStart) + file = open("testdata/file" + str(filenumber) + ".txt", "w") + for w in range(0, len(content)): + file.write(content[w] + "\n") + diff --git a/src/algorithms/kmeansMkI.py b/src/algorithms/kmeansMkI.py index 44eb3e1..48923f4 100644 --- a/src/algorithms/kmeansMkI.py +++ b/src/algorithms/kmeansMkI.py @@ -1,19 +1,18 @@ #!/usr/bin/env python -#title: kmeansMkI.py -#description: Our personal Python K-Means++ implementation -#author: Tillmann Brendel, Conrad Großer -#license: Pending -#date: 26.05.2018 -#version: 1.2 -#usage: python pyscript.py -#notes: -#dependencies: mathplotlib -#known_issues: -#python_version: 3.x -#============================================================================== +# title: kmeansMkI.py +# description: Our personal Python K-Means++ implementation +# author: Tillmann Brendel, Conrad Großer +# license: Pending +# date: 26.05.2018 +# version: 1.2 +# usage: python pyscript.py +# notes: +# dependencies: mathplotlib +# known_issues: +# python_version: 3.x +# ============================================================================== # IMPORTS - # Importing the time for benchmarking purposes import time from datetime import date @@ -28,117 +27,120 @@ import matplotlib.pyplot as plt import dmlib import dmtest -# CODE + # Main function of the algorithm def kmeansmk1(data, clusters): - globals()["cpoint_0"] = data[randint(0, len(data))] - globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"]) + globals()["cpoint_0"] = data[randint(0, len(data))] + globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"]) - print("Initial cluster 1: " + str(globals()["cpoint_0"])) - print("Initial cluster 2: " + str(globals()["cpoint_1"])) + print("Initial cluster 1: " + str(globals()["cpoint_0"])) + print("Initial cluster 2: " + str(globals()["cpoint_1"])) - # Defining cluster points - for i in range(2, clusters): - globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)]) - print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)])) + # Defining cluster points + for i in range(2, clusters): + globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)]) + print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)])) - # Get max value in the data array - highPoint = dmlib.findHighest(data) + # Get max value in the data array + highPoint = max(data) - # Define variables for running the algorithm (runs is just for benchmarking!) - done = 0 - runs = 0 + # Define variables for running the algorithm (runs is just for benchmarking!) + done, runs = False, 0 - # As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters - while done == 0: - runs = runs + 1 - new_data = assignCluster(data, highPoint, clusters) - done = calcClusters(new_data, clusters) + # As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters + while not done: + runs += 1 + new_data = assignCluster(data, highPoint, clusters) + done = calcClusters(new_data, clusters) - # Printing final clusters - for i in range(0, clusters): - print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs") + # Printing final clusters + for i in range(0, clusters): + print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs") - # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data - anew = [] - inew = 0 - while inew < len(data): - anew.append(inew) - inew = inew + 1 + # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data + anew, inew = [], 0 - # Drawing found clusters as lines - for i in range(0, clusters): - plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r') + while inew < len(data): + anew.append(inew) + inew += 1 - # Showing graph - plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k') - plt.show() + # Drawing found clusters as lines + for i in range(0, clusters): + plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r') + + # Showing graph + plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k') + plt.show() + + return 0 - return 0 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) def calcClusters(data, clusters): - changed = 0 - for cluster in range(0, clusters): - # Getting current cluster and saving it in temporary variable - prev_cluster = globals()["cpoint_" + str(cluster)] - # Sum of the cluster to calculate average difference between cluster center and data points - clustersum = 0 - item_count = 0 + changed = False + for cluster in range(0, clusters): + # Getting current cluster and saving it in temporary variable + prev_cluster = globals()["cpoint_" + str(cluster)] + # Sum of the cluster to calculate average difference between cluster center and data points + clustersum = 0 + item_count = 0 - for item in range(0, len(data[0])): - if data[1][item] == globals()["cpoint_" + str(cluster)]: - clustersum = clustersum + int(data[0][item]) - item_count = item_count + 1 - globals()["cpoint_" + str(cluster)] = round(clustersum / item_count) + for item in range(0, len(data[0])): + if data[1][item] == globals()["cpoint_" + str(cluster)]: + clustersum = clustersum + int(data[0][item]) + item_count = item_count + 1 + globals()["cpoint_" + str(cluster)] = round(clustersum / item_count) - # Checking if previous clusterpoint is equal to the one just calculated - if prev_cluster == globals()["cpoint_" + str(cluster)]: - changed = 1 + # Checking if previous clusterpoint is equal to the one just calculated + if prev_cluster == globals()["cpoint_" + str(cluster)]: + changed = True + + return changed - return changed def assignCluster(data, highPoint, clusters): - # Create a new data array for working - new_data = [] - new_data.append(data) + # Create a new data array for working + new_data = [data] - # Create new array for assigned clusters of each value - data_assigned = [] + # Create new array for assigned clusters of each value + data_assigned = [] - # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index]) - for item in range(0, len(new_data[0])): - # Set the minimal cluster difference to the highest difference in the list to ease comparision - min_cluster = highPoint + # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index]) + for item in data: + # Set the minimal cluster difference to the highest difference in the list to ease comparision + min_cluster = highPoint - # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference - for cluster in range(0, clusters): - if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]): - min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]) - assinged_cluster = globals()["cpoint_" + str(cluster)] - # Assign the minimal difference cluster to the data - data_assigned.append(assinged_cluster) - # Add the assigned values list to the new_data array - new_data.append(data_assigned) + # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference + for cluster in range(0, clusters): + if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]): + min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]) + assinged_cluster = globals()["cpoint_" + str(cluster)] + + # Assign the minimal difference cluster to the data + data_assigned.append(assinged_cluster) + # Add the assigned values list to the new_data array + new_data.append(data_assigned) + + return new_data - return new_data # Startup function for collecting necesarry data def startup(data): - # Using two clusters for testing - clusters = int(input("How many clusters are known? ")) - # cores = input("How many cores should be used? ") - # path = input("Where is the data? ") or in this case data - - # For benchmarking starting the timer now - start_time = time.time() + # Using two clusters for testing + clusters = int(input("How many clusters are known? ")) + # cores = input("How many cores should be used? ") + # path = input("Where is the data? ") or in this case data - # Firing up the engines! - kmeansmk1(data, clusters) + # For benchmarking starting the timer now + start_time = time.time() + + # Firing up the engines! + kmeansmk1(data, clusters) + + # Stopping benchmark + seconds = time.time() - start_time + print(str(seconds) + " seconds for execution") - # Stopping benchmark - seconds = time.time() - start_time - print(str(seconds) + " seconds for execution") # Start the algorithm and generate test data data = dmtest.plzGen(10000) diff --git a/src/data_generators/randomi.py b/src/data_generators/randomi.py index b6d0981..8840c60 100644 --- a/src/data_generators/randomi.py +++ b/src/data_generators/randomi.py @@ -10,69 +10,79 @@ from datetime import date # Importing for multi core processing import multiprocessing + # randomI function which creates each file def randomI(units, rows, rowLength, partstart): - for setcounter in range(0, units): - writeFile(generateFile(rows, rowLength), setcounter, partstart) + for setcounter in range(0, units): + writeFile(generateFile(rows, rowLength), setcounter, partstart) + return True + # Function for generating the content of one single file def generateFile(rows, rowLength): - content = [] - for y in range(0, rows): - content.append(generateRow(rowLength)) - return content + content = [] + for entry in rows: + content.append(generateRow(rowLength)) + return content + # Function for generating the content of one single row randomly def generateRow(rowLength): - row = "" - for z in range(0, rowLength): - row = row + str(randint(0, 9)) - return row + row = "" + for z in range(0, rowLength): + row += str(randint(0, 9)) + return row + # Function for writing data into a file def writeFile(content, setcounter, partstart): - filenumber = int(setcounter) + int(partstart) - file = open("testdata/file" + str(filenumber) + ".txt", "w") - for w in range(0, len(content)): - file.write(content[w] + "\n") + filenumber = int(setcounter) + int(partstart) + file = open("testdata/file" + str(filenumber) + ".txt", "w") + for line in content: + file.write(line + "\n") + return True + if __name__ == '__main__': - # Getting the user input - print("Hello World") - units = int(input("How many units would you like to generate? ")) - rows = int(input("How many rows should each unit have? ")) - rowLength = int(input("How long should each row be? ")) - cores = int(input("How many cores do you want to use? ")) + # Getting the user input + print("Hello World") + units = int(input("How many units would you like to generate? ")) + rows = int(input("How many rows should each unit have? ")) + rowLength = int(input("How long should each row be? ")) + cores = int(input("How many cores do you want to use? ")) - # Splitting up the units - count = int(0) - partsize = units / cores + # Splitting up the units + count = 0 + partsize = units / cores - # For benchmarking starting the timer now - start_time = time.time() + # For benchmarking starting the timer now + start_time = time.time() - # Initialize and prepare cores for process - while count < cores: - partstart = partsize * count - globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart)) - count = count + 1 + # Initialize and prepare cores for process + while count < cores: + partstart = partsize * count + globals()["p" + str(count)] = multiprocessing.Process( + target=randomI, + args=(int(partsize), rows, rowLength, partstart) + ) + count += 1 - # Starting each core - count = int(0) - while count < cores: - globals()["p" + str(count)].start() - print("Core " + str(count) + " started.") - count = count + 1 + # Starting each core + count = 0 + while count < cores: + globals()["p" + str(count)].start() + print("Core " + str(count) + " started.") + count += 1 - print("Working...") + print("Working...") - # Joining each core for the process - count = int(0) - while count < cores: - globals()["p" + str(count)].join() - count = count + 1 + # Joining each core for the process + count = 0 + while count < cores: + globals()["p" + str(count)].join() + count += 1 - # Finishing up the process - sec = time.time() - start_time - print("Data is generated. Have fun!") - print("randomI took " + str(sec) + " seconds for execution.") + # Finishing up the process + sec = time.time() - start_time + print("Data is generated. Have fun!") + print("randomI took " + str(sec) + " seconds for execution.") diff --git a/src/data_generators/randomi2.1.py b/src/data_generators/randomi2.1.py index b035f58..1c68afe 100644 --- a/src/data_generators/randomi2.1.py +++ b/src/data_generators/randomi2.1.py @@ -1,15 +1,15 @@ #!/usr/bin/env python -#title: randomI2.1.py -#description: Personal -#author: Tillmann Brendel, Conrad Großer -#license: Pending -#date: 26.05.2018 -#version: 1.0 -#usage: python pyscript.py -#notes: -#known_issues: -#python_version: 3.x -#============================================================================== +# title: randomI2.1.py +# description: Personal +# author: Tillmann Brendel, Conrad Großer +# license: Pending +# date: 26.05.2018 +# version: 1.0 +# usage: python pyscript.py +# notes: +# known_issues: +# python_version: 3.x +# ============================================================================== # For random generation of numbers import randint from random import randint @@ -21,76 +21,87 @@ from datetime import date # Importing for multi core processing import multiprocessing + # randomI function which creates each file def randomI(units, rows, rowLength, partstart, cluster): - for setcounter in range(0, units): - writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart) + for setcounter in range(0, units): + writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart) + return True + # Function for generating the content of one single file def generateFile(rows, rowLength, cluster): - content = [] - for y in range(0, rows): - if y == 0: - if 1 == randint(1, cluster): - content.append(generate09()) - else: - content.append(generatePLZ()) - else: - content.append(generateRow(rowLength)) - return content + content = [] + for entry in rows: + if entry == 0: + if randint(1, cluster) == 1: + content.append(generate09()) + else: + content.append(generatePLZ()) + else: + content.append(generateRow(rowLength)) + return content + # Function for generating the content of one single row randomly def generateRow(rowLength): - row = "" - for z in range(0, rowLength): - row = row + str(randint(0, 9)) - return row + row = '' + for z in range(0, rowLength): + row += str(randint(0, 9)) + return row + # Function for writing data into a file (content = string, setcount and partstart are for better naming) def writeFile(content, setcounter, partstart): - filenumber = int(setcounter) + int(partstart) - file = open("testdata/file" + str(filenumber) + ".txt", "w") - for w in range(0, len(content)): - file.write(content[w] + "\n") + filenumber = int(setcounter) + int(partstart) + file = open('testdata/file' + str(filenumber) + '.txt', 'w') + + for line in content: + file.write(line + '\n') + return True + if __name__ == '__main__': - # Getting the user input - print("Hello World") - units = int(input("How many units would you like to generate? ")) - rows = int(input("How many rows should each unit have? ")) - rowLength = int(input("How long should each row be? ")) - cores = int(input("How many cores do you want to use? ")) - cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/")) + # Getting the user input + print('Hello World') + units = int(input('How many units would you like to generate? ')) + rows = int(input('How many rows should each unit have? ')) + rowLength = int(input('How long should each row be? ')) + cores = int(input('How many cores do you want to use? ')) + cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/')) # Splitting up the units - count = int(0) - partsize = units / cores + count = 0 + partsize = units / cores - # For benchmarking starting the timer now - start_time = time.time() + # For benchmarking starting the timer now + start_time = time.time() - # Initialize and prepare cores for process - while count < cores: - partstart = partsize * count - globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster)) - count = count + 1 + # Initialize and prepare cores for process + while count < cores: + partstart = partsize * count + globals()['p' + str(count)] = multiprocessing.Process( + target=randomI, + args=(int(partsize), rows, rowLength, partstart, cluster) + ) + count += 1 - # Starting each core - count = int(0) - while count < cores: - globals()["p" + str(count)].start() - print("Core " + str(count) + " started.") - count = count + 1 + # Starting each core + count = int(0) + while count < cores: + globals()['p' + str(count)].start() + print('Core ' + str(count) + ' started.') + count += 1 - print("Working...") + print('Working...') - # Joining each core for the process - count = int(0) - while count < cores: - globals()["p" + str(count)].join() - count = count + 1 + # Joining each core for the process + count = 0 + while count < cores: + globals()['p' + str(count)].join() + count += 1 - # Finishing up the process - sec = time.time() - start_time - print("Data is generated. Have fun!") - print("randomI took " + str(sec) + " seconds for execution.") + # Finishing up the process + sec = time.time() - start_time + print('Data is generated. Have fun!') + print('randomI took ' + str(sec) + ' seconds for execution.')