Update .gitignore

Code Rework
kmeans ++ Update
2019-04-25 12:45:05 +02:00 · 2019-04-25 12:43:37 +02:00 · 2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 testdata/
 __pycache__/
+.DS_Store
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -4,36 +4,23 @@ def calcdiff(point1, point2):
        difference = int(point2) - int(point1)
    else:
        difference = int(point1) - int(point2)
-	# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
-	return betrag(difference)
+    return abs(difference)

-# Get the absolute value of a number and returns it as int
-def betrag(number):
-	if number < 0:
-		number = int((-2 * number) / 2)
-	return number
-
-# Determine the highest int value in an array and returns is as an int
-def findHighest(data):
-	maximum = 0
-	for i in range(0, len(data)):
-		if int(data[i]) > maximum:
-			maximum = int(data[i])
-	return maximum

 def pp_calcdiff(data, clusterpoint):
    max_diff = 0
    new_cluster = 0
-	for item in range(0,len(data)):
+    for item in range(0, len(data)):
        if calcdiff(data[item], clusterpoint) > max_diff:
            max_diff = calcdiff(data[item], clusterpoint)
            new_cluster = data[item]
    return new_cluster

+
 def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
    max_diff = 0
    new_cluster = 0
-	for item in range(0,len(data)):
+    for item in range(0, len(data)):
        if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
            max_diff = calcdiff(data[item], clusterpoint)
            new_cluster = data[item]
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -1,6 +1,7 @@
 # For random generation of numbers import randint
 from random import randint, shuffle

+
 # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
 def plzGen(entries):
    dataArray = []
@@ -13,18 +14,20 @@ def plzGen(entries):
        elif i >= round(entries * 0.6) and i < round(entries * 0.9):
            plz = generateNumber(plz_lenght, 4)
        else:
-			plz = generateNumber(plz_lenght, randint(0,9))
+            plz = generateNumber(plz_lenght, randint(0, 9))
        dataArray.append(plz)
    shuffle(dataArray)
    return dataArray

+
 # Function for generating the content of one single row randomly
 def generateNumber(numberLenght, startingNumber):
    number = str(startingNumber)
    for length in range(0, numberLenght - 1):
-		number = number + str(randint(0,9))
+        number = number + str(randint(0, 9))
    return number

+
 # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
 # /testdata/ folder has to be created at this point
 def writeFile(content, nameChunkStart, namePartStart):
@@ -32,3 +35,4 @@ def writeFile(content, nameChunkStart, namePartStart):
    file = open("testdata/file" + str(filenumber) + ".txt", "w")
    for w in range(0, len(content)):
        file.write(content[w] + "\n")
+
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -1,19 +1,18 @@
 #!/usr/bin/env python
-#title:				kmeansMkI.py
-#description:		Our personal Python K-Means++ implementation
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.2
-#usage:				python pyscript.py
-#notes:
-#dependencies:		mathplotlib
-#known_issues:
-#python_version:	3.x
-#==============================================================================
+# title:            kmeansMkI.py
+# description:      Our personal Python K-Means++ implementation
+# author:           Tillmann Brendel, Conrad Großer
+# license:          Pending
+# date:             26.05.2018
+# version:          1.2
+# usage:            python pyscript.py
+# notes:
+# dependencies:     matplotlib
+# known_issues:
+# python_version:   3.x
+# ==============================================================================

 # IMPORTS
-
 # Importing the time for benchmarking purposes
 import time
 from datetime import date
@@ -28,24 +27,29 @@ import matplotlib.pyplot as plt
 import dmlib
 import dmtest

-# CODE
+
 # Main function of the algorithm
 def kmeansmk1(data, clusters):
+    globals()["cpoint_0"] = data[randint(0, len(data))]
+    globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
+
+    print("Initial cluster 1: " + str(globals()["cpoint_0"]))
+    print("Initial cluster 2: " + str(globals()["cpoint_1"]))
+
    # Defining cluster points
-	for i in range(0, clusters):
-		globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
+    for i in range(2, clusters):
+        globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get max value in the data array
-	highPoint = dmlib.findHighest(data)
+    highPoint = max(data)

    # Define variables for running the algorithm (runs is just for benchmarking!)
-	done = 0
-	runs = 0
+    done, runs = False, 0

    # As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
-	while done == 0:
-		runs = runs + 1
+    while not done:
+        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

@@ -54,11 +58,11 @@ def kmeansmk1(data, clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
-	anew = []
-	inew = 0
+    anew, inew = [], 0
+
    while inew < len(data):
        anew.append(inew)
-		inew = inew + 1
+        inew += 1

    # Drawing found clusters as lines
    for i in range(0, clusters):
@@ -70,9 +74,10 @@ def kmeansmk1(data, clusters):

    return 0

+
 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
 def calcClusters(data, clusters):
-	changed = 0
+    changed = False
    for cluster in range(0, clusters):
        # Getting current cluster and saving it in temporary variable
        prev_cluster = globals()["cpoint_" + str(cluster)]
@@ -88,28 +93,29 @@ def calcClusters(data, clusters):

        # Checking if previous clusterpoint is equal to the one just calculated
        if prev_cluster == globals()["cpoint_" + str(cluster)]:
-			changed = 1
+            changed = True

    return changed

+
 def assignCluster(data, highPoint, clusters):
    # Create a new data array for working
-	new_data = []
-	new_data.append(data)
+    new_data = [data]

    # Create new array for assigned clusters of each value
    data_assigned = []

    # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, len(new_data[0])):
+    for item in data:
        # Set the minimal cluster difference to the highest difference in the list to ease comparision
        min_cluster = highPoint

        # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
        for cluster in range(0, clusters):
-			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
-				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
+            if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]):
+                min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)])
                assinged_cluster = globals()["cpoint_" + str(cluster)]
+
        # Assign the minimal difference cluster to the data
        data_assigned.append(assinged_cluster)
    # Add the assigned values list to the new_data array
@@ -117,6 +123,7 @@ def assignCluster(data, highPoint, clusters):

    return new_data

+
 # Startup function for collecting necesarry data
 def startup(data):
    # Using two clusters for testing
@@ -134,6 +141,7 @@ def startup(data):
    seconds = time.time() - start_time
    print(str(seconds) + " seconds for execution")

+
 # Start the algorithm and generate test data
 data = dmtest.plzGen(10000)

--- a/src/data_generators/randomi.py
+++ b/src/data_generators/randomi.py
@@ -10,31 +10,38 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing

+
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart):
    for setcounter in range(0, units):
        writeFile(generateFile(rows, rowLength), setcounter, partstart)
+    return True
+

 # Function for generating the content of one single file
 def generateFile(rows, rowLength):
     content = []
-	for y in range(0, rows):
+    for _ in range(rows):  # rows is a row count (int), not an iterable
         content.append(generateRow(rowLength))
     return content

+
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
    row = ""
    for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
+        row += str(randint(0, 9))
    return row

+
 # Function for writing data into a file
 def writeFile(content, setcounter, partstart):
    filenumber = int(setcounter) + int(partstart)
    file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+    for line in content:
+        file.write(line + "\n")
+    return True
+

 if __name__ == '__main__':
    # Getting the user input
@@ -45,7 +52,7 @@ if __name__ == '__main__':
    cores = int(input("How many cores do you want to use? "))

    # Splitting up the units
-	count = int(0)
+    count = 0
    partsize = units / cores

    # For benchmarking starting the timer now
@@ -54,23 +61,26 @@ if __name__ == '__main__':
    # Initialize and prepare cores for process
    while count < cores:
        partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart))
-		count = count + 1
+        globals()["p" + str(count)] = multiprocessing.Process(
+            target=randomI,
+            args=(int(partsize), rows, rowLength, partstart)
+            )
+        count += 1

    # Starting each core
-	count = int(0)
+    count = 0
    while count < cores:
        globals()["p" + str(count)].start()
        print("Core " + str(count) + " started.")
-		count = count + 1
+        count += 1

    print("Working...")

    # Joining each core for the process
-	count = int(0)
+    count = 0
    while count < cores:
        globals()["p" + str(count)].join()
-		count = count + 1
+        count += 1

    # Finishing up the process
    sec = time.time() - start_time
--- a/src/data_generators/randomi2.1.py
+++ b/src/data_generators/randomi2.1.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python
-#title:				randomI2.1.py
-#description:		Personal 
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.0
-#usage:				python pyscript.py
-#notes:
-#known_issues:
-#python_version:	3.x
-#==============================================================================
+# title:              randomI2.1.py
+# description:        Personal
+# author:             Tillmann Brendel, Conrad Großer
+# license:            Pending
+# date:               26.05.2018
+# version:            1.0
+# usage:              python pyscript.py
+# notes:
+# known_issues:
+# python_version:    3.x
+# ==============================================================================

 # For random generation of numbers import randint
 from random import randint
@@ -21,17 +21,20 @@ from datetime import date
 # Importing for multi core processing
 import multiprocessing

+
 # randomI function which creates each file
 def randomI(units, rows, rowLength, partstart, cluster):
    for setcounter in range(0, units):
        writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
+    return True
+

 # Function for generating the content of one single file
 def generateFile(rows, rowLength, cluster):
     content = []
-	for y in range(0, rows):
-		if y == 0:
-			if 1 == randint(1, cluster):
+    for entry in range(rows):  # rows is a row count (int), not an iterable
+        if entry == 0:
+            if randint(1, cluster) == 1:
                 content.append(generate09())
             else:
                 content.append(generatePLZ())
@@ -39,31 +42,36 @@ def generateFile(rows, rowLength, cluster):
             content.append(generateRow(rowLength))
     return content

+
 # Function for generating the content of one single row randomly
 def generateRow(rowLength):
-	row = ""
+    row = ''
    for z in range(0, rowLength):
-		row = row + str(randint(0, 9))
+        row += str(randint(0, 9))
    return row

+
 # Function for writing data into a file (content = string, setcount and partstart are for better naming)
 def writeFile(content, setcounter, partstart):
    filenumber = int(setcounter) + int(partstart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+    file = open('testdata/file' + str(filenumber) + '.txt', 'w')
+
+    for line in content:
+        file.write(line + '\n')
+    return True
+

 if __name__ == '__main__':
    # Getting the user input
-	print("Hello World")
-	units = int(input("How many units would you like to generate? "))
-	rows = int(input("How many rows should each unit have? "))
-	rowLength = int(input("How long should each row be? "))
-	cores = int(input("How many cores do you want to use? "))
-	cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
+    print('Hello World')
+    units = int(input('How many units would you like to generate? '))
+    rows = int(input('How many rows should each unit have? '))
+    rowLength = int(input('How long should each row be? '))
+    cores = int(input('How many cores do you want to use? '))
+    cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))

    # Splitting up the units
-	count = int(0)
+    count = 0
    partsize = units / cores

    # For benchmarking starting the timer now
@@ -72,25 +80,28 @@ if __name__ == '__main__':
    # Initialize and prepare cores for process
    while count < cores:
        partstart = partsize * count
-		globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
-		count = count + 1
+        globals()['p' + str(count)] = multiprocessing.Process(
+            target=randomI,
+            args=(int(partsize), rows, rowLength, partstart, cluster)
+            )
+        count += 1

    # Starting each core
    count = int(0)
    while count < cores:
-		globals()["p" + str(count)].start()
-		print("Core " + str(count) + " started.")
-		count = count + 1
+        globals()['p' + str(count)].start()
+        print('Core ' + str(count) + ' started.')
+        count += 1

-	print("Working...")
+    print('Working...')

    # Joining each core for the process
-	count = int(0)
+    count = 0
    while count < cores:
-		globals()["p" + str(count)].join()
-		count = count + 1
+        globals()['p' + str(count)].join()
+        count += 1

    # Finishing up the process
    sec = time.time() - start_time
-	print("Data is generated. Have fun!")
-	print("randomI took " + str(sec) + " seconds for execution.")
+    print('Data is generated. Have fun!')
+    print('randomI took ' + str(sec) + ' seconds for execution.')
Author	SHA1	Message	Date
Conrad	94ed193954	Update .gitignore	2019-04-25 12:45:05 +02:00
Conrad	e2ad63f90f	Code Rework	2019-04-25 12:43:37 +02:00
Conrad	25fa068df9	kmeans ++ Update - Added concept for kmeans ++	2018-06-02 23:31:43 +02:00