Update .gitignore

Code Rework
kmeans ++ Update
2019-04-25 12:45:05 +02:00 · 2019-04-25 12:43:37 +02:00 · 2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 testdata/
 __pycache__/
 .DS_Store
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -1,40 +1,27 @@
 # Calculate the absolute difference between two points; both arguments are converted with int() first
 def calcdiff(point1, point2):
-	if int(point2) > int(point1):
+    if int(point2) > int(point1):
-		difference = int(point2) - int(point1)
+        difference = int(point2) - int(point1)
-	else:
+    else:
-		difference = int(point1) - int(point2)
+        difference = int(point1) - int(point2)
-	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
+    return abs(difference)
 	return betrag(difference)
 # Return the absolute value of a number.
 # A negative input is negated and converted to int; a non-negative input is
 # returned unchanged (no int conversion is applied on that path).
 def betrag(number):
 	if number < 0:
 		# (-2 * number) / 2 is the positive magnitude of number; int() converts the result
 		number = int((-2 * number) / 2)
 	return number
 # Determine the highest value in data, comparing each entry after int() conversion,
 # and return it as an int.
 # The running maximum starts at 0, so 0 is returned for empty data or when no
 # entry converts to a value greater than 0.
 def findHighest(data):
 	maximum = 0
 	for i in range(0, len(data)):
 		if int(data[i]) > maximum:
 			maximum = int(data[i])
 	return maximum
 def pp_calcdiff(data, clusterpoint):
-	max_diff = 0
+    max_diff = 0
-	new_cluster = 0
+    new_cluster = 0
-	for item in range(0,len(data)):
+    for item in range(0, len(data)):
-		if calcdiff(data[item], clusterpoint) > max_diff:
+        if calcdiff(data[item], clusterpoint) > max_diff:
-			max_diff = calcdiff(data[item], clusterpoint)
+            max_diff = calcdiff(data[item], clusterpoint)
-			new_cluster = data[item]
+            new_cluster = data[item]
-	return new_cluster
+    return new_cluster
 def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
-	max_diff = 0
+    max_diff = 0
-	new_cluster = 0
+    new_cluster = 0
-	for item in range(0,len(data)):
+    for item in range(0, len(data)):
-		if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
+        if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
-			max_diff = calcdiff(data[item], clusterpoint)
+            max_diff = calcdiff(data[item], clusterpoint)
-			new_cluster = data[item]
+            new_cluster = data[item]
-	return new_cluster
+    return new_cluster
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -1,34 +1,38 @@
 # For random generation of numbers import randint
 from random import randint, shuffle
 # Simple generator for test PLZs, biased by leading digit (about 40% start with 2, 20% with 9, 30% with 4, 10% with a random digit); returns a shuffled 1D array of PLZ strings
 def plzGen(entries):
-	dataArray = []
+    dataArray = []
-	plz_lenght = 5
+    plz_lenght = 5
-	for i in range(0, int(entries)):
+    for i in range(0, int(entries)):
-		if i < round(entries * 0.4):
+        if i < round(entries * 0.4):
-			plz = generateNumber(plz_lenght, 2)
+            plz = generateNumber(plz_lenght, 2)
-		elif i >= round(entries * 0.4) and i < round(entries * 0.6):
+        elif i >= round(entries * 0.4) and i < round(entries * 0.6):
-			plz = generateNumber(plz_lenght, 9)
+            plz = generateNumber(plz_lenght, 9)
-		elif i >= round(entries * 0.6) and i < round(entries * 0.9):
+        elif i >= round(entries * 0.6) and i < round(entries * 0.9):
-			plz = generateNumber(plz_lenght, 4)
+            plz = generateNumber(plz_lenght, 4)
-		else:
+        else:
-			plz = generateNumber(plz_lenght, randint(0,9))
+            plz = generateNumber(plz_lenght, randint(0, 9))
-		dataArray.append(plz)
+        dataArray.append(plz)
-	shuffle(dataArray)
+    shuffle(dataArray)
-	return dataArray
+    return dataArray
 # Function for generating the content of one single row randomly
 def generateNumber(numberLenght, startingNumber):
-	number = str(startingNumber)
+    number = str(startingNumber)
-	for length in range(0, numberLenght - 1):
+    for length in range(0, numberLenght - 1):
-		number = number + str(randint(0,9))
+        number = number + str(randint(0, 9))
-	return number
+    return number
 # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
 # /testdata/ folder has to be created at this point
 def writeFile(content, nameChunkStart, namePartStart):
-	filenumber = int(nameChunkStart) + int(namePartStart)
+    filenumber = int(nameChunkStart) + int(namePartStart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
+    file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
+    for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+        file.write(content[w] + "\n")
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -1,19 +1,18 @@
 #!/usr/bin/env python
-#title:				kmeansMkI.py
+# title:            kmeansMkI.py
-#description:		Our personal Python K-Means++ implementation
+# description:      Our personal Python K-Means++ implementation
-#author:			Tillmann Brendel, Conrad Großer
+# author:           Tillmann Brendel, Conrad Großer
-#license:			Pending
+# license:          Pending
-#date:				26.05.2018
+# date:             26.05.2018
-#version:			1.2
+# version:          1.2
-#usage:				python pyscript.py
+# usage:            python pyscript.py
-#notes:
+# notes:
-#dependencies:		mathplotlib
+# dependencies:     mathplotlib
-#known_issues:
+# known_issues:
-#python_version:	3.x
+# python_version:   3.x
-#==============================================================================
+# ==============================================================================
 # IMPORTS
 # Importing the time for benchmarking purposes
 import time
 from datetime import date
@@ -28,111 +27,120 @@ import matplotlib.pyplot as plt
 import dmlib
 import dmtest
-# CODE
+
 # Main function of the algorithm
 def kmeansmk1(data, clusters):
-	# Defining cluster points
+    globals()["cpoint_0"] = data[randint(0, len(data))]
-	for i in range(0, clusters):
+    globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
 		globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
 		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
-	# Get max value in the data array
+    print("Initial cluster 1: " + str(globals()["cpoint_0"]))
-	highPoint = dmlib.findHighest(data)
+    print("Initial cluster 2: " + str(globals()["cpoint_1"]))
-	# Define variables for running the algorithm (runs is just for benchmarking!)
+    # Defining cluster points
-	done = 0
+    for i in range(2, clusters):
-	runs = 0
+        globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
-	# As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
+    # Get max value in the data array
-	while done == 0:
+    highPoint = max(data)
 		runs = runs + 1
 		new_data = assignCluster(data, highPoint, clusters)
 		done = calcClusters(new_data, clusters)
-	# Printing final clusters
+    # Define variables for running the algorithm (runs is just for benchmarking!)
-	for i in range(0, clusters):
+    done, runs = False, 0
 		print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
-	# Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
+    # As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
-	anew = []
+    while not done:
-	inew = 0
+        runs += 1
-	while inew < len(data):
+        new_data = assignCluster(data, highPoint, clusters)
-		anew.append(inew)
+        done = calcClusters(new_data, clusters)
 		inew = inew + 1
-	# Drawing found clusters as lines
+    # Printing final clusters
-	for i in range(0, clusters):
+    for i in range(0, clusters):
-		plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
+        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
-	# Showing graph
+    # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
-	plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
+    anew, inew = [], 0
-	plt.show()
+
    while inew < len(data):
        anew.append(inew)
        inew += 1
    # Drawing found clusters as lines
    for i in range(0, clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()
    return 0
 	return 0
 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
 def calcClusters(data, clusters):
-	changed = 0
+    changed = False
-	for cluster in range(0, clusters):
+    for cluster in range(0, clusters):
-		# Getting current cluster and saving it in temporary variable
+        # Getting current cluster and saving it in temporary variable
-		prev_cluster = globals()["cpoint_" + str(cluster)]
+        prev_cluster = globals()["cpoint_" + str(cluster)]
-		# Sum of the cluster to calculate average difference between cluster center and data points 
+        # Sum of the cluster to calculate average difference between cluster center and data points
-		clustersum = 0
+        clustersum = 0
-		item_count = 0
+        item_count = 0
-		for item in range(0, len(data[0])):
+        for item in range(0, len(data[0])):
-			if data[1][item] == globals()["cpoint_" + str(cluster)]:
+            if data[1][item] == globals()["cpoint_" + str(cluster)]:
-				clustersum = clustersum + int(data[0][item])
+                clustersum = clustersum + int(data[0][item])
-				item_count = item_count + 1
+                item_count = item_count + 1
-		globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
+        globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
-		# Checking if previous clusterpoint is equal to the one just calculated
+        # Checking if previous clusterpoint is equal to the one just calculated
-		if prev_cluster == globals()["cpoint_" + str(cluster)]:
+        if prev_cluster == globals()["cpoint_" + str(cluster)]:
-			changed = 1
+            changed = True
    return changed
 	return changed
 def assignCluster(data, highPoint, clusters):
-	# Create a new data array for working
+    # Create a new data array for working
-	new_data = []
+    new_data = [data]
 	new_data.append(data)
-	# Create new array for assigned clusters of each value
+    # Create new array for assigned clusters of each value
-	data_assigned = []
+    data_assigned = []
-	# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
+    # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, len(new_data[0])):
+    for item in data:
-		# Set the minimal cluster difference to the highest difference in the list to ease comparision
+        # Set the minimal cluster difference to the highest difference in the list to ease comparision
-		min_cluster = highPoint
+        min_cluster = highPoint
-		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
+        # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
-		for cluster in range(0, clusters):
+        for cluster in range(0, clusters):
-			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
+            if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]):
-				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
+                min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)])
-				assinged_cluster = globals()["cpoint_" + str(cluster)]
+                assinged_cluster = globals()["cpoint_" + str(cluster)]
-		# Assign the minimal difference cluster to the data
+
-		data_assigned.append(assinged_cluster)
+        # Assign the minimal difference cluster to the data
-	# Add the assigned values list to the new_data array
+        data_assigned.append(assinged_cluster)
-	new_data.append(data_assigned)
+    # Add the assigned values list to the new_data array
    new_data.append(data_assigned)
    return new_data
 	return new_data
 # Startup function for collecting necessary data
 def startup(data):
-	# Using two clusters for testing
+    # Using two clusters for testing
-	clusters = int(input("How many clusters are known? "))
+    clusters = int(input("How many clusters are known? "))
-	# cores = input("How many cores should be used? ")
+    # cores = input("How many cores should be used? ")
-	# path = input("Where is the data? ") or in this case data
+    # path = input("Where is the data? ") or in this case data
 	# For benchmarking starting the timer now
 	start_time = time.time()
-	# Firing up the engines!
+    # For benchmarking starting the timer now
-	kmeansmk1(data, clusters)
+    start_time = time.time()
    # Firing up the engines!
    kmeansmk1(data, clusters)
    # Stopping benchmark
    seconds = time.time() - start_time
    print(str(seconds) + " seconds for execution")
 	# Stopping benchmark
 	seconds = time.time() - start_time
 	print(str(seconds) + " seconds for execution")
 # Start the algorithm and generate test data
 data = dmtest.plzGen(10000)
--- a/src/data_generators/randomi.py
+++ b/src/data_generators/randomi.py
@@ -10,69 +10,79 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart):
-	for setcounter in range(0, units):
+    for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength), setcounter, partstart)
+        writeFile(generateFile(rows, rowLength), setcounter, partstart)
    return True
 # Function for generating the content of one single file
 def generateFile(rows, rowLength):
-	content = []
+    content = []
-	for y in range(0, rows):
+    for entry in rows:
-		content.append(generateRow(rowLength))
+        content.append(generateRow(rowLength))
-	return content
+    return content
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
-	row = ""
+    row = ""
-	for z in range(0, rowLength):
+    for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
+        row += str(randint(0, 9))
-	return row
+    return row
 # Function for writing data into a file
 def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
+    filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
+    file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
+    for line in content:
-		file.write(content[w] + "\n")
+        file.write(line + "\n")
    return True
 if __name__ == '__main__':
-	# Getting the user input
+    # Getting the user input
-	print("Hello World")
+    print("Hello World")
-	units = int(input("How many units would you like to generate? "))
+    units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
+    rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
+    rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
+    cores = int(input("How many cores do you want to use? "))
-	# Splitting up the units
+    # Splitting up the units
-	count = int(0)
+    count = 0
-	partsize = units / cores
+    partsize = units / cores
-	# For benchmarking starting the timer now
+    # For benchmarking starting the timer now
-	start_time = time.time()
+    start_time = time.time()
-	# Initialize and prepare cores for process
+    # Initialize and prepare cores for process
-	while count < cores:
+    while count < cores:
-		partstart = partsize * count
+        partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart))
+        globals()["p" + str(count)] = multiprocessing.Process(
-		count = count + 1
+            target=randomI,
            args=(int(partsize), rows, rowLength, partstart)
            )
        count += 1
-	# Starting each core
+    # Starting each core
-	count = int(0)
+    count = 0
-	while count < cores:
+    while count < cores:
-		globals()["p" + str(count)].start()
+        globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
+        print("Core " + str(count) + " started.")
-		count = count + 1
+        count += 1
-	print("Working...")
+    print("Working...")
-	# Joining each core for the process
+    # Joining each core for the process
-	count = int(0)
+    count = 0
-	while count < cores:
+    while count < cores:
-		globals()["p" + str(count)].join()
+        globals()["p" + str(count)].join()
-		count = count + 1
+        count += 1
-	# Finishing up the process
+    # Finishing up the process
-	sec = time.time() - start_time
+    sec = time.time() - start_time
-	print("Data is generated. Have fun!")	
+    print("Data is generated. Have fun!")
-	print("randomI took " + str(sec) + " seconds for execution.")
+    print("randomI took " + str(sec) + " seconds for execution.")
--- a/src/data_generators/randomi2.1.py
+++ b/src/data_generators/randomi2.1.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python
-#title:				randomI2.1.py
+# title:              randomI2.1.py
-#description:		Personal 
+# description:        Personal
-#author:			Tillmann Brendel, Conrad Großer
+# author:             Tillmann Brendel, Conrad Großer
-#license:			Pending
+# license:            Pending
-#date:				26.05.2018
+# date:               26.05.2018
-#version:			1.0
+# version:            1.0
-#usage:				python pyscript.py
+# usage:              python pyscript.py
-#notes:
+# notes:
-#known_issues:
+# known_issues:
-#python_version:	3.x
+# python_version:    3.x
-#==============================================================================
+# ==============================================================================
 # For random generation of numbers import randint
 from random import randint
@@ -21,76 +21,87 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart, cluster):
-	for setcounter in range(0, units):
+    for setcounter in range(0, units):
-		writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
+        writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
    return True
 # Function for generating the content of one single file
 def generateFile(rows, rowLength, cluster):
-	content = []
+    content = []
-	for y in range(0, rows):
+    for entry in rows:
-		if y == 0:
+        if entry == 0:
-			if 1 == randint(1, cluster):
+            if randint(1, cluster) == 1:
-				content.append(generate09())
+                content.append(generate09())
-			else:
+            else:
-				content.append(generatePLZ())
+                content.append(generatePLZ())
-		else:
+        else:
-			content.append(generateRow(rowLength))
+            content.append(generateRow(rowLength))
-	return content
+    return content
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
-	row = ""
+    row = ''
-	for z in range(0, rowLength):
+    for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
+        row += str(randint(0, 9))
-	return row
+    return row
 # Function for writing data into a file (content = string, setcount and partstart are for better naming)
 def writeFile(content, setcounter, partstart):
-	filenumber = int(setcounter) + int(partstart)
+    filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
+    file = open('testdata/file' + str(filenumber) + '.txt', 'w')
-	for w in range(0, len(content)):
+
-		file.write(content[w] + "\n")
+    for line in content:
        file.write(line + '\n')
    return True
 if __name__ == '__main__':
-	# Getting the user input
+    # Getting the user input
-	print("Hello World")
+    print('Hello World')
-	units = int(input("How many units would you like to generate? "))
+    units = int(input('How many units would you like to generate? '))
-	rows = int(input("How many rows should each unit have? "))
+    rows = int(input('How many rows should each unit have? '))
-	rowLength = int(input("How long should each row be? "))
+    rowLength = int(input('How long should each row be? '))
-	cores = int(input("How many cores do you want to use? "))
+    cores = int(input('How many cores do you want to use? '))
-	cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
+    cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
    # Splitting up the units
-	count = int(0)
+    count = 0
-	partsize = units / cores
+    partsize = units / cores
-	# For benchmarking starting the timer now
+    # For benchmarking starting the timer now
-	start_time = time.time()
+    start_time = time.time()
-	# Initialize and prepare cores for process
+    # Initialize and prepare cores for process
-	while count < cores:
+    while count < cores:
-		partstart = partsize * count
+        partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
+        globals()['p' + str(count)] = multiprocessing.Process(
-		count = count + 1
+            target=randomI,
            args=(int(partsize), rows, rowLength, partstart, cluster)
            )
        count += 1
-	# Starting each core
+    # Starting each core
-	count = int(0)
+    count = int(0)
-	while count < cores:
+    while count < cores:
-		globals()["p" + str(count)].start()
+        globals()['p' + str(count)].start()
-		print("Core " + str(count) + " started.")
+        print('Core ' + str(count) + ' started.')
-		count = count + 1
+        count += 1
-	print("Working...")
+    print('Working...')
-	# Joining each core for the process
+    # Joining each core for the process
-	count = int(0)
+    count = 0
-	while count < cores:
+    while count < cores:
-		globals()["p" + str(count)].join()
+        globals()['p' + str(count)].join()
-		count = count + 1
+        count += 1
-	# Finishing up the process
+    # Finishing up the process
-	sec = time.time() - start_time
+    sec = time.time() - start_time
-	print("Data is generated. Have fun!")
+    print('Data is generated. Have fun!')
-	print("randomI took " + str(sec) + " seconds for execution.")
+    print('randomI took ' + str(sec) + ' seconds for execution.')
Author	SHA1	Message	Date
Conrad	94ed193954	Update .gitignore	2019-04-25 12:45:05 +02:00
Conrad	e2ad63f90f	Code Rework	2019-04-25 12:43:37 +02:00
Conrad	25fa068df9	kmeans ++ Update - Added concept for kmeans ++	2018-06-02 23:31:43 +02:00