From e2ad63f90f5e58b039de820c7b75c8d22297a2c6 Mon Sep 17 00:00:00 2001
From: Conrad <grosserconrad@gmail.com>
Date: Thu, 25 Apr 2019 12:43:37 +0200
Subject: [PATCH] Code Rework

---
 src/algorithms/dmlib.py           |  53 ++++-----
 src/algorithms/dmtest.py          |  48 ++++----
 src/algorithms/kmeansMkI.py       | 192 +++++++++++++++---------------
 src/data_generators/randomi.py    | 102 +++++++++-------
 src/data_generators/randomi2.1.py | 137 +++++++++++----------
 5 files changed, 273 insertions(+), 259 deletions(-)

diff --git a/src/algorithms/dmlib.py b/src/algorithms/dmlib.py
index 8d7203e..b4d3876 100644
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -1,40 +1,27 @@
 # Calculate the difference between two points giving the indexes of these data entries
 def calcdiff(point1, point2):
-	if int(point2) > int(point1):
-		difference = int(point2) - int(point1)
-	else:
-		difference = int(point1) - int(point2)
-	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
-	return betrag(difference)
+    # int() lets the points be passed as ints or as numeric strings; abs()
+    # makes the order of the two arguments irrelevant, so no branch is needed
+    # to decide which value gets subtracted from which.
+    difference = int(point2) - int(point1)
+    return abs(difference)
 
-# Get the absolute value of a number and returns it as int
-def betrag(number):
-	if number < 0:
-		number = int((-2 * number) / 2)
-	return number
-
-# Determine the highest int value in an array and returns is as an int
-def findHighest(data):
-	maximum = 0
-	for i in range(0, len(data)):
-		if int(data[i]) > maximum:
-			maximum = int(data[i])
-	return maximum
 
 def pp_calcdiff(data, clusterpoint):
-	max_diff = 0
-	new_cluster = 0
-	for item in range(0,len(data)):
-		if calcdiff(data[item], clusterpoint) > max_diff:
-			max_diff = calcdiff(data[item], clusterpoint)
-			new_cluster = data[item]
-	return new_cluster
+    # Return the entry of data farthest from clusterpoint (0 if none differs)
+    max_diff, new_cluster = 0, 0
+    for point in data:
+        diff = calcdiff(point, clusterpoint)
+        if diff > max_diff:
+            max_diff, new_cluster = diff, point
+    return new_cluster
+
 
 def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
-	max_diff = 0
-	new_cluster = 0
-	for item in range(0,len(data)):
-		if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
-			max_diff = calcdiff(data[item], clusterpoint)
-			new_cluster = data[item]
-	return new_cluster
\ No newline at end of file
+    # Return the entry with the largest summed distance to both cluster points
+    max_diff, new_cluster = 0, 0
+    for point in data:
+        diff = calcdiff(point, clusterpoint) + calcdiff(point, clusterpoint_2)
+        if diff > max_diff:
+            max_diff, new_cluster = diff, point  # keep the sum, not just one term
+    return new_cluster
diff --git a/src/algorithms/dmtest.py b/src/algorithms/dmtest.py
index d13dc48..ff98075 100644
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -1,34 +1,38 @@
 # For random generation of numbers import randint
 from random import randint, shuffle
 
+
 # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
 def plzGen(entries):
-	dataArray = []
-	plz_lenght = 5
-	for i in range(0, int(entries)):
-		if i < round(entries * 0.4):
-			plz = generateNumber(plz_lenght, 2)
-		elif i >= round(entries * 0.4) and i < round(entries * 0.6):
-			plz = generateNumber(plz_lenght, 9)
-		elif i >= round(entries * 0.6) and i < round(entries * 0.9):
-			plz = generateNumber(plz_lenght, 4)
-		else:
-			plz = generateNumber(plz_lenght, randint(0,9))
-		dataArray.append(plz)
-	shuffle(dataArray)
-	return dataArray
+    dataArray, total = [], int(entries)
+    first, second, third = (round(entries * share) for share in (0.4, 0.6, 0.9))
+    for i in range(total):
+        if i < first:  # roughly the first 40% of the entries
+            lead = 2
+        elif i < second:  # the next 20%
+            lead = 9
+        elif i < third:  # the next 30%
+            lead = 4
+        else:  # the remaining entries get a random leading digit
+            lead = randint(0, 9)
+        dataArray.append(generateNumber(5, lead))  # every PLZ is 5 digits long
+    shuffle(dataArray)
+    return dataArray
+
 
 # Function for generating the content of one single row randomly
 def generateNumber(numberLenght, startingNumber):
-	number = str(startingNumber)
-	for length in range(0, numberLenght - 1):
-		number = number + str(randint(0,9))
-	return number
+    # The first digit is fixed; the remaining numberLenght - 1 digits are random
+    digits = [str(startingNumber)]
+    digits.extend(str(randint(0, 9)) for _ in range(numberLenght - 1))
+    return "".join(digits)
+
 
 # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
 # /testdata/ folder has to be created at this point
 def writeFile(content, nameChunkStart, namePartStart):
-	filenumber = int(nameChunkStart) + int(namePartStart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
\ No newline at end of file
+    filenumber = int(nameChunkStart) + int(namePartStart)
+    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:  # closed even if a write fails
+        for line in content:
+            file.write(line + "\n")
+
diff --git a/src/algorithms/kmeansMkI.py b/src/algorithms/kmeansMkI.py
index 44eb3e1..48923f4 100644
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -1,19 +1,18 @@
 #!/usr/bin/env python
-#title:				kmeansMkI.py
-#description:		Our personal Python K-Means++ implementation
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.2
-#usage:				python pyscript.py
-#notes:				
-#dependencies:		mathplotlib
-#known_issues:		
-#python_version:	3.x
-#==============================================================================
+# title:            kmeansMkI.py
+# description:      Our personal Python K-Means++ implementation
+# author:           Tillmann Brendel, Conrad Großer
+# license:          Pending
+# date:             26.05.2018
+# version:          1.2
+# usage:            python pyscript.py
+# notes:
+# dependencies:     matplotlib
+# known_issues:
+# python_version:   3.x
+# ==============================================================================
 
 # IMPORTS
-
 # Importing the time for benchmarking purposes
 import time
 from datetime import date
@@ -28,117 +27,120 @@ import matplotlib.pyplot as plt
 import dmlib
 import dmtest
 
-# CODE
+
 # Main function of the algorithm
 def kmeansmk1(data, clusters):
-	globals()["cpoint_0"] = data[randint(0, len(data))]
-	globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
+    globals()["cpoint_0"] = data[randint(0, len(data) - 1)]  # randint includes both bounds
+    globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
 
-	print("Initial cluster 1: " + str(globals()["cpoint_0"]))
-	print("Initial cluster 2: " + str(globals()["cpoint_1"]))
+    print("Initial cluster 1: " + str(globals()["cpoint_0"]))
+    print("Initial cluster 2: " + str(globals()["cpoint_1"]))
 
-	# Defining cluster points
-	for i in range(2, clusters):
-		globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
-		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
+    # Defining cluster points
+    for i in range(2, clusters):
+        globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
+        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
 
-	# Get max value in the data array
-	highPoint = dmlib.findHighest(data)
+    # Get max value in the data array, compared as ints (entries may be numeric strings); 0 if empty
+    highPoint = max((int(x) for x in data), default=0)
 
-	# Define variables for running the algorithm (runs is just for benchmarking!)
-	done = 0
-	runs = 0
+    # Define variables for running the algorithm (runs is just for benchmarking!)
+    done, runs = False, 0
 
-	# As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
-	while done == 0:
-		runs = runs + 1
-		new_data = assignCluster(data, highPoint, clusters)
-		done = calcClusters(new_data, clusters)
+    # Repeat cluster assignment and centre recalculation until calcClusters returns a truthy value
+    while not done:
+        runs += 1
+        new_data = assignCluster(data, highPoint, clusters)
+        done = calcClusters(new_data, clusters)
 
-	# Printing final clusters
-	for i in range(0, clusters):
-		print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
+    # Printing final clusters
+    for i in range(0, clusters):
+        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
 
-	# Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
-	anew = []
-	inew = 0
-	while inew < len(data):
-		anew.append(inew)
-		inew = inew + 1
+    # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
+    anew, inew = [], 0
 
-	# Drawing found clusters as lines
-	for i in range(0, clusters):
-		plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
+    while inew < len(data):
+        anew.append(inew)
+        inew += 1
 
-	# Showing graph
-	plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
-	plt.show()
+    # Drawing found clusters as lines
+    for i in range(0, clusters):
+        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
+
+    # Showing graph
+    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
+    plt.show()
+
+    return 0
 
-	return 0
 
 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
 def calcClusters(data, clusters):
-	changed = 0
-	for cluster in range(0, clusters):
-		# Getting current cluster and saving it in temporary variable
-		prev_cluster = globals()["cpoint_" + str(cluster)]
-		# Sum of the cluster to calculate average difference between cluster center and data points 
-		clustersum = 0
-		item_count = 0
+    changed = False
+    for cluster in range(0, clusters):
+        # Getting current cluster and saving it in temporary variable
+        prev_cluster = globals()["cpoint_" + str(cluster)]
+        # Sum of the cluster to calculate average difference between cluster center and data points
+        clustersum = 0
+        item_count = 0
 
-		for item in range(0, len(data[0])):
-			if data[1][item] == globals()["cpoint_" + str(cluster)]:
-				clustersum = clustersum + int(data[0][item])
-				item_count = item_count + 1
-		globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
+        for item in range(0, len(data[0])):
+            if data[1][item] == globals()["cpoint_" + str(cluster)]:
+                clustersum = clustersum + int(data[0][item])
+                item_count = item_count + 1
+        globals()["cpoint_" + str(cluster)] = round(clustersum / item_count) if item_count else prev_cluster  # an empty cluster keeps its centre instead of dividing by zero
 
-		# Checking if previous clusterpoint is equal to the one just calculated
-		if prev_cluster == globals()["cpoint_" + str(cluster)]:
-			changed = 1
+        # Checking if previous clusterpoint is equal to the one just calculated
+        if prev_cluster == globals()["cpoint_" + str(cluster)]:
+            changed = True
+
+    return changed
 
-	return changed
 
 def assignCluster(data, highPoint, clusters):
-	# Create a new data array for working
-	new_data = []
-	new_data.append(data)
+    # Create a new data array for working
+    new_data = [data]
 
-	# Create new array for assigned clusters of each value
-	data_assigned = []
+    # Create new array for assigned clusters of each value
+    data_assigned = []
 
-	# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, len(new_data[0])):
-		# Set the minimal cluster difference to the highest difference in the list to ease comparision
-		min_cluster = highPoint
+    # For each item in data find the cluster centre with the minimal difference; the centres are collected in data_assigned, parallel to data
+    for item in data:
+        # Start from highPoint as the initial minimal difference so that any smaller difference replaces it
+        min_cluster = highPoint
 
-		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
-		for cluster in range(0, clusters):
-			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
-				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
-				assinged_cluster = globals()["cpoint_" + str(cluster)]
-		# Assign the minimal difference cluster to the data
-		data_assigned.append(assinged_cluster)
-	# Add the assigned values list to the new_data array
-	new_data.append(data_assigned)
+        # Compare the item against every cluster centre, remembering the smallest difference seen so far
+        for cluster in range(0, clusters):
+            centre = globals()["cpoint_" + str(cluster)]
+            diff = dmlib.calcdiff(item, centre)
+            if int(min_cluster) > diff:
+                min_cluster, assinged_cluster = diff, centre
+        # Record the closest centre found for this item
+        data_assigned.append(assinged_cluster)
+    # new_data becomes [data, data_assigned]
+    new_data.append(data_assigned)
+
+    return new_data
 
-	return new_data
 
 # Startup function for collecting necesarry data
 def startup(data):
-	# Using two clusters for testing
-	clusters = int(input("How many clusters are known? "))
-	# cores = input("How many cores should be used? ")
-	# path = input("Where is the data? ") or in this case data
-	
-	# For benchmarking starting the timer now
-	start_time = time.time()
+    # Using two clusters for testing
+    clusters = int(input("How many clusters are known? "))
+    # cores = input("How many cores should be used? ")
+    # path = input("Where is the data? ") or in this case data
 
-	# Firing up the engines!
-	kmeansmk1(data, clusters)
+    # For benchmarking starting the timer now
+    start_time = time.time()
+
+    # Firing up the engines!
+    kmeansmk1(data, clusters)
+
+    # Stopping benchmark
+    seconds = time.time() - start_time
+    print(str(seconds) + " seconds for execution")
 
-	# Stopping benchmark
-	seconds = time.time() - start_time
-	print(str(seconds) + " seconds for execution")
 
 # Start the algorithm and generate test data
 data = dmtest.plzGen(10000)
diff --git a/src/data_generators/randomi.py b/src/data_generators/randomi.py
index b6d0981..8840c60 100644
--- a/src/data_generators/randomi.py
+++ b/src/data_generators/randomi.py
@@ -10,69 +10,79 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing
 
+
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart):
-	for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength), setcounter, partstart)
+    for setcounter in range(0, units):
+        writeFile(generateFile(rows, rowLength), setcounter, partstart)
+    return True
+
 
 # Function for generating the content of one single file
 def generateFile(rows, rowLength):
-	content = []
-	for y in range(0, rows):
-		content.append(generateRow(rowLength))
-	return content
+    # rows is the number of rows (an int, see the int(input(...)) in __main__),
+    # so it has to be expanded with range() before it can be iterated.
+    content = [generateRow(rowLength) for _ in range(rows)]
+    return content
+
 
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
-	row = ""
-	for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
-	return row
+    # Build the row from rowLength random decimal digits
+    digits = [str(randint(0, 9)) for _ in range(rowLength)]
+    row = "".join(digits)
+    return row
+
 
 # Function for writing data into a file
 def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+    filenumber = int(setcounter) + int(partstart)
+    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:  # closed even if a write fails
+        for line in content:
+            file.write(line + "\n")
+    return True
+
 
 if __name__ == '__main__':
-	# Getting the user input
-	print("Hello World")
-	units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
+    # Getting the user input
+    print("Hello World")
+    units = int(input("How many units would you like to generate? "))
+    rows = int(input("How many rows should each unit have? "))
+    rowLength = int(input("How long should each row be? "))
+    cores = int(input("How many cores do you want to use? "))
 
-	# Splitting up the units
-	count = int(0)
-	partsize = units / cores
+    # Splitting up the units
+    count = 0
+    partsize = units / cores
 
-	# For benchmarking starting the timer now
-	start_time = time.time()
+    # For benchmarking starting the timer now
+    start_time = time.time()
 
-	# Initialize and prepare cores for process
-	while count < cores:
-		partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart))
-		count = count + 1
+    # Initialize and prepare cores for process
+    while count < cores:
+        partstart = partsize * count
+        globals()["p" + str(count)] = multiprocessing.Process(
+            target=randomI,
+            args=(int(partsize), rows, rowLength, partstart)
+            )
+        count += 1
 
-	# Starting each core
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
-		count = count + 1
+    # Starting each core
+    count = 0
+    while count < cores:
+        globals()["p" + str(count)].start()
+        print("Core " + str(count) + " started.")
+        count += 1
 
-	print("Working...")
+    print("Working...")
 
-	# Joining each core for the process
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].join()
-		count = count + 1
+    # Joining each core for the process
+    count = 0
+    while count < cores:
+        globals()["p" + str(count)].join()
+        count += 1
 
-	# Finishing up the process
-	sec = time.time() - start_time
-	print("Data is generated. Have fun!")	
-	print("randomI took " + str(sec) + " seconds for execution.")
+    # Finishing up the process
+    sec = time.time() - start_time
+    print("Data is generated. Have fun!")
+    print("randomI took " + str(sec) + " seconds for execution.")
diff --git a/src/data_generators/randomi2.1.py b/src/data_generators/randomi2.1.py
index b035f58..1c68afe 100644
--- a/src/data_generators/randomi2.1.py
+++ b/src/data_generators/randomi2.1.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python
-#title:				randomI2.1.py
-#description:		Personal 
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.0
-#usage:				python pyscript.py
-#notes:
-#known_issues:
-#python_version:	3.x
-#==============================================================================
+# title:              randomI2.1.py
+# description:        Personal
+# author:             Tillmann Brendel, Conrad Großer
+# license:            Pending
+# date:               26.05.2018
+# version:            1.0
+# usage:              python pyscript.py
+# notes:
+# known_issues:
+# python_version:    3.x
+# ==============================================================================
 
 # For random generation of numbers import randint
 from random import randint
@@ -21,76 +21,87 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing
 
+
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart, cluster):
-	for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
+    for setcounter in range(0, units):
+        writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
+    return True
+
 
 # Function for generating the content of one single file
 def generateFile(rows, rowLength, cluster):
-	content = []
-	for y in range(0, rows):
-		if y == 0:
-			if 1 == randint(1, cluster):
-				content.append(generate09())
-			else:
-				content.append(generatePLZ())
-		else:
-			content.append(generateRow(rowLength))
-	return content
+    content = []
+    for entry in range(rows):  # rows is an int row count, not an iterable
+        if entry == 0:  # the first row comes from generate09()/generatePLZ()
+            if randint(1, cluster) == 1:  # 1-in-cluster chance for generate09()
+                content.append(generate09())
+            else:
+                content.append(generatePLZ())  # NOTE(review): generate09/generatePLZ are not defined in the visible hunks - confirm
+        else:
+            content.append(generateRow(rowLength))
+    return content
+
 
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
-	row = ""
-	for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
-	return row
+    # Build the row from rowLength random decimal digits
+    digits = [str(randint(0, 9)) for _ in range(rowLength)]
+    row = ''.join(digits)
+    return row
+
 
 # Function for writing data into a file (content = string, setcount and partstart are for better naming)
 def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+    filenumber = int(setcounter) + int(partstart)
+    # The with-block closes (and flushes) the file even if a write fails
+    with open('testdata/file' + str(filenumber) + '.txt', 'w') as file:
+        for line in content:
+            file.write(line + '\n')
+    return True
+
 
 if __name__ == '__main__':
-	# Getting the user input
-	print("Hello World")
-	units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
-	cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
+    # Getting the user input
+    print('Hello World')
+    units = int(input('How many units would you like to generate? '))
+    rows = int(input('How many rows should each unit have? '))
+    rowLength = int(input('How long should each row be? '))
+    cores = int(input('How many cores do you want to use? '))
+    cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
 
     # Splitting up the units
-	count = int(0)
-	partsize = units / cores
+    count = 0
+    partsize = units / cores
 
-	# For benchmarking starting the timer now
-	start_time = time.time()
+    # For benchmarking starting the timer now
+    start_time = time.time()
 
-	# Initialize and prepare cores for process
-	while count < cores:
-		partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
-		count = count + 1
+    # Initialize and prepare cores for process
+    while count < cores:
+        partstart = partsize * count
+        globals()['p' + str(count)] = multiprocessing.Process(
+            target=randomI,
+            args=(int(partsize), rows, rowLength, partstart, cluster)
+            )
+        count += 1
 
-	# Starting each core
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
-		count = count + 1
+    # Starting each core
+    count = 0
+    while count < cores:
+        globals()['p' + str(count)].start()
+        print('Core ' + str(count) + ' started.')
+        count += 1
 
-	print("Working...")
+    print('Working...')
 
-	# Joining each core for the process
-	count = int(0)
-	while count < cores:
-		globals()["p" + str(count)].join()
-		count = count + 1
+    # Joining each core for the process
+    count = 0
+    while count < cores:
+        globals()['p' + str(count)].join()
+        count += 1
 
-	# Finishing up the process
-	sec = time.time() - start_time
-	print("Data is generated. Have fun!")
-	print("randomI took " + str(sec) + " seconds for execution.")
+    # Finishing up the process
+    sec = time.time() - start_time
+    print('Data is generated. Have fun!')
+    print('randomI took ' + str(sec) + ' seconds for execution.')