Reworked Code

2019-04-25 12:06:06 +02:00
parent 7d31468c43
commit 37daadb020
4 changed files with 178 additions and 172 deletions
--- a/src/algorithms/dmlib.py
+++ b/src/algorithms/dmlib.py
@@ -1,5 +1,7 @@
+# Imports for dmlib
 import math

+
 # Calculate the difference between two points giving the indexes of these xdata entries
 def calcdiff(point1, point2):
    if int(point2) > int(point1):
@@ -8,23 +10,10 @@ def calcdiff(point1, point2):
        difference = int(point1) - int(point2)
    return difference

+
 # Calculate the difference between two points in 2D space
 def calcdiff2d(point1, point2):
    point1 = [int(i) for i in point1]
    point2 = [int(i) for i in point2]
    difference = math.sqrt(((point2[0]) - (point1[0])) ** 2 + ((point2[1]) - (point1[1])) ** 2)
-    return betrag(difference)
-
-# Get the absolute value of a number and returns it as int
-def betrag(number):
-    if number < 0:
-        number = int((-2 * number) / 2)
-    return number
-
-# Determine the highest int value in an array and returns is as an int
-def findHighest(data):
-    maximum = 0
-    for i in range(0, len(data)):
-        if int(data[i]) > maximum:
-            maximum = int(data[i])
-    return maximum
+    return abs(difference)
--- a/src/algorithms/dmtest.py
+++ b/src/algorithms/dmtest.py
@@ -1,55 +1,64 @@
 # For random generation of numbers import randint
 from random import randint, shuffle

+
 # Simple generator for test nums (40-40-20 biased), returns 1D array of nums
 def numGenLight(entries, shuffle, num_lenght):
-	dataArray = []
-	for i in range(0, int(entries)):
-		if i < round(entries * 0.4):
-			num = generateNumber(num_lenght, 2)
-		elif i >= round(entries * 0.4) and i < round(entries * 0.6):
-			num = generateNumber(num_lenght, 9)
-		elif i >= round(entries * 0.6) and i < round(entries * 0.9):
-			num = generateNumber(num_lenght, 4)
-		else:
-			num = generateNumber(num_lenght, randint(0,9))
-		dataArray.append(num)
-	if shuffle:
-		shuffle(dataArray)
-	return dataArray
+    dataArray = []
+    for i in range(0, int(entries)):
+        if i < round(entries * 0.4):
+            num = generateNumber(num_lenght, 2)
+        elif i >= round(entries * 0.4) and i < round(entries * 0.6):
+            num = generateNumber(num_lenght, 9)
+        elif i >= round(entries * 0.6) and i < round(entries * 0.9):
+            num = generateNumber(num_lenght, 4)
+        else:
+            num = generateNumber(num_lenght, randint(0, 9))
+        dataArray.append(num)
+    if shuffle:
+        shuffle(dataArray)
+    return dataArray
+

 # Function for generating the content of one single row randomly
 def generateNumber(numberLenght, startingNumber):
-	number = str(startingNumber)
-	for length in range(0, numberLenght - 1):
-		number = number + str(randint(0,9))
-	return number
+    number = str(startingNumber)
+    for length in range(0, numberLenght - 1):
+        number = number + str(randint(0, 9))
+    return number

-# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
+
+# Function for writing data into a file, one entry per line
+# content = sequence of strings; nameChunkStart + namePartStart give the file number
 # /testdata/ folder has to be created at this point
 def writeFile(content, nameChunkStart, namePartStart):
-	filenumber = int(nameChunkStart) + int(namePartStart)
-	file = open("testdata/file" + str(filenumber) + ".txt", "w")
-	for w in range(0, len(content)):
-		file.write(content[w] + "\n")
+    filenumber = int(nameChunkStart) + int(namePartStart)
+    file = open("testdata/file" + str(filenumber) + ".txt", "w")
+    for w in range(0, len(content)):
+        file.write(content[w] + "\n")
+

 # Function for generating 'entries'x int_lenght'-long numbers in 'clusters' clusters
 def numGen(entries, cluster, int_lenght, suffle_value):
-	dataArray = []
-	clusterArray = []
+    dataArray = []
+    clusterArray = []

-	for cluster_num in range(0, cluster):
-		clusterArray.append(randint(10,99))
+    for cluster_num in range(0, cluster):
+        clusterArray.append(randint(10, 99))

-	for item in range(0, entries):
-		decider = randint(0, 2)
-		if decider == 2:
-			dataArray.append(generateNumber(int_lenght, randint(1,9)))
-		else:
-			cluster_decider = randint(0, cluster - 1)
-			dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
+    for item in range(0, entries):
+        decider = randint(0, 2)
+        if decider == 2:
+            dataArray.append(generateNumber(int_lenght, randint(1, 9)))
+        else:
+            cluster_decider = randint(0, cluster - 1)
+            dataArray.append(
+                generateNumber(
+                    int_lenght - 1,
+                    clusterArray[cluster_decider]
+                    ))

-	if suffle_value:
-		shuffle(dataArray)
+    if suffle_value:
+        shuffle(dataArray)

-	return dataArray
+    return dataArray
--- a/src/algorithms/kmeansMkI.py
+++ b/src/algorithms/kmeansMkI.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python
-#title:				kmeansMkI.py
-#description:		Our personal Python K-Means++ implementation
-#author:			Tillmann Brendel, Conrad Großer
-#license:			Pending
-#date:				26.05.2018
-#version:			1.2
-#usage:				python pyscript.py
-#notes:
-#dependencies:		mathplotlib
-#known_issues:		When clusters are 'thin' or noice is to strong --> unaccurate
-#python_version:	3.x
-#==============================================================================
+# title:            kmeansMkI.py
+# description:      Our personal Python K-Means++ implementation
+# author:           Tillmann Brendel, Conrad Großer
+# license:          Pending
+# date:             26.05.2018
+# version:          1.3
+# usage:            python pyscript.py
+# notes:
+# dependencies:     matplotlib
+# known_issues:     When clusters are 'thin' or noise is too strong --> inaccurate
+# python_version:   3.x
+# ==============================================================================

 # IMPORTS

@@ -28,112 +28,117 @@ import matplotlib.pyplot as plt
 import dmlib
 import dmtest

+
 # CODE
 # Main function of the algorithm
 def kmeansmk1(data, clusters):
-	# Defining cluster points
-	for i in range(0, clusters):
-		globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
-		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
+    # Defining cluster points
+    for i in range(0, clusters):
+        globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
+        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

-	# Get max value in the data array
-	highPoint = dmlib.findHighest(data)
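+    # Get max value in the data array. NOTE(review): max() returns an element of data unchanged, so for the digit strings produced by dmtest.numGen highPoint is a str (the removed dmlib.findHighest returned an int), and assignCluster compares it with the int result of dmlib.calcdiff - confirm and convert if needed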
+    highPoint = max(data)

-	# Define variables for running the algorithm (runs is just for benchmarking!)
-	done = 0
-	runs = 0
+    # Define variables for running the algorithm (runs is just for benchmarking!)
+    done = False
+    runs = 0

-	# As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
-	while done == 0:
-		runs = runs + 1
-		new_data = assignCluster(data, highPoint, clusters)
-		done = calcClusters(new_data, clusters)
+    # Until calcClusters returns True, keep assigning the data to the clusters and recalculating the cluster points
+    while not done:
+        runs += 1
+        new_data = assignCluster(data, highPoint, clusters)
+        done = calcClusters(new_data, clusters)

-	# Printing final clusters
-	for i in range(0, clusters):
-		print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
+    # Printing final clusters
+    for i in range(0, clusters):
+        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

-	# Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
-	anew = []
-	inew = 0
-	while inew < len(data):
-		anew.append(inew)
-		inew = inew + 1
+    # Build an artificial index array (0 .. len(data) - 1) used as y-values to visualize the 1D data in a 2D graphic
+    anew = []
+    inew = 0
+    while inew < len(data):
+        anew.append(inew)
+        inew = inew + 1

-	# Drawing found clusters as lines
-	for i in range(0, clusters):
-		plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')
+    # Drawing found clusters as lines
+    for i in range(0, clusters):
+        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

-	# Showing graph
-	plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
-	plt.show()
+    # Showing graph
+    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
+    plt.show()
+
+    return 0

-	return 0

 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
 def calcClusters(data, clusters):
-	changed = 0
-	for cluster in range(0, clusters):
-		# Getting current cluster and saving it in temporary variable
-		prev_cluster = globals()["cpoint_" + str(cluster)]
-		# Sum of the cluster to calculate average difference between cluster center and data points 
-		clustersum = 0
-		item_count = 0
+    changed = False
+    for cluster in range(0, clusters):
+        # Getting current cluster and saving it in temporary variable
+        prev_cluster = globals()["cpoint_" + str(cluster)]
+        # Sum and count of the items assigned to this cluster, used to calculate their mean as the new cluster point
+        clustersum = 0
+        item_count = 0

-		for item in range(0, len(data[0])):
-			if data[1][item] == globals()["cpoint_" + str(cluster)]:
-				clustersum = clustersum + int(data[0][item])
-				item_count = item_count + 1
-		globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)
+        for item in range(0, len(data[0])):
+            if data[1][item] == globals()["cpoint_" + str(cluster)]:
+                clustersum = clustersum + int(data[0][item])
+                item_count = item_count + 1
+        globals()["cpoint_" + str(cluster)] = round(clustersum / item_count)

-		# Checking if previous clusterpoint is equal to the one just calculated
-		if prev_cluster == globals()["cpoint_" + str(cluster)]:
-			changed = 1
+        # Checking if previous clusterpoint is equal to the one just calculated
+        if prev_cluster == globals()["cpoint_" + str(cluster)]:
+            changed = True
+
+    return changed

-	return changed

 def assignCluster(data, highPoint, clusters):
-	# Create a new data array for working
-	new_data = []
-	new_data.append(data)
+    # Create a new data array for working
+    new_data = []
+    new_data.append(data)

-	# Create new array for assigned clusters of each value
-	data_assigned = []
+    # Create new array for assigned clusters of each value
+    data_assigned = []

-	# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
-	for item in range(0, len(new_data[0])):
-		# Set the minimal cluster difference to the highest difference in the list to ease comparision
-		min_cluster = highPoint
+    # For each item in data find the cluster with the minimal difference and record that cluster's point; the resulting list becomes new_data[1] (new_data[0] is the data itself)
+    for item in range(0, len(new_data[0])):
+        # Start the minimal cluster difference at the highest value in the data to ease comparison
+        min_cluster = highPoint

-		# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference 
-		for cluster in range(0, clusters):
-			if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
-				min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
-				assinged_cluster = globals()["cpoint_" + str(cluster)]
-		# Assign the minimal difference cluster to the data
-		data_assigned.append(assinged_cluster)
-	# Add the assigned values list to the new_data array
-	new_data.append(data_assigned)
+        # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
+        for cluster in range(0, clusters):
+            if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
+                min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
+                assinged_cluster = globals()["cpoint_" + str(cluster)]
+        # Assign the minimal difference cluster to the data
+        data_assigned.append(assinged_cluster)
+    # Add the assigned values list to the new_data array
+    new_data.append(data_assigned)
+
+    return new_data

-	return new_data

 # Startup function for collecting necesarry data
 def startup(data):
-	# Using two clusters for testing
-	clusters = int(input("How many clusters are known? "))
-	# cores = input("How many cores should be used? ")
-	# path = input("Where is the data? ") or in this case data
-	
-	# For benchmarking starting the timer now
-	start_time = time.time()
+    # Using two clusters for testing
+    clusters = int(input("How many clusters are known? "))
+    # cores = input("How many cores should be used? ")
+    # path = input("Where is the data? ") or in this case data

-	# Firing up the engines!
-	kmeansmk1(data, clusters)
+    # For benchmarking starting the timer now
+    start_time = time.time()
+
+    # Firing up the engines!
+    kmeansmk1(data, clusters)
+
+    # Stopping benchmark
+    seconds = time.time() - start_time
+    print(str(seconds) + " seconds for execution")

-	# Stopping benchmark
-	seconds = time.time() - start_time
-	print(str(seconds) + " seconds for execution")

 # Start the algorithm and generate test data
-data = dmtest.numGen(10000, 2, 5, True)
+data = dmtest.numGen(10000, 10, 5, True)
 startup(data)
--- a/src/algorithms/kmeansMkI_2d.py
+++ b/src/algorithms/kmeansMkI_2d.py
@@ -1,14 +1,14 @@
 #!/usr/bin/env python
-# title:				kmeansMkI.py
-# description:		Our personal Python K-Means++ implementation
-# author:			Tillmann Brendel, Conrad Großer
-# license:			Pending
-# date:				04.06.2018
-# version:			1.5
-# usage:				python pyscript.py
+# title:            kmeansMkI_2d.py
+# description:      Our personal Python K-Means++ implementation
+# author:           Tillmann Brendel, Conrad Großer
+# license:          Pending
+# date:             04.06.2018
+# version:          1.6
+# usage:            python pyscript.py
 # notes:
 # known_issues:
-# python_version:	3.x
+# python_version:   3.x
 # ==============================================================================

 # IMPORTS
@@ -31,79 +31,81 @@ import matplotlib.pyplot as plt
 import dmlib
 import dmtest

-# CODE
+
 # Main function of the algorithm
 def kmeansmk1(xdata, ydata, clusters):
    # Defining cluster points
    for i in range(0, clusters):
        globals()["cpoint_" + str(i)] = [xdata[randint(0, len(xdata))], ydata[randint(0, len(ydata))]]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
-    #get max data in the data arrays
-    highpointx = dmlib.findHighest(xdata)
-    highpointy = dmlib.findHighest(ydata)
-    #print('highpoinx: ' + str(highpointx))
-    #print('highpointy: ' + str(highpointy))
+
+    # Get the maximum of the data
+    highpointx = max(xdata)
+    highpointy = max(ydata)

    # Define variables for running the algorithm (runs is just as important as every other variable)
-    done = 0
+    done = False
    runs = 0

-    # As long as calcClusters returns done it will rearrange the clusters and assign the data to the clusters
-    while done == 0:
-        runs = runs + 1
+    # As long as calcClusters returns False it will rearrange the clusters and assign the data to the clusters
+    while not done:
+        runs += 1
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
-        #assigned_points consists of the clusternumbers
+        # assigned_points holds the assigned cluster number for each point
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(0, clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
    for i in range(0, clusters):
        plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
+
    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()

+
 # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
 def calcClusters(xdata, ydata, assigned_points, clusters):
    for cluster in range(0, clusters):
-        cpointunchanged = 1
+        cpointunchanged = True
        globals()["oldcpoint_" + str(cluster)] = globals()["cpoint_" + str(cluster)]
        clustersumx = 0
        clustersumy = 0
        count = 0
-        #print('calcclusters running')
+
        for item in range(0, len(xdata)):
            if assigned_points[item] == cluster:
                clustersumx = clustersumx + int(xdata[item])
                clustersumy = clustersumy + int(ydata[item])
                count = count + 1
-           # print('item ' + str(item) +'done')
+
        globals()["cpoint_" + str(cluster)] = [round(clustersumx / count), round(clustersumy / count)]
-        #print('cluster ' + str(cluster) + 'done')
+
        # checking if old clusterpoint is equal to the one just calculated
        if globals()["oldcpoint_" + str(cluster)] != globals()["cpoint_" + str(cluster)]:
-            cpointunchanged = 0
+            cpointunchanged = False

    return cpointunchanged

+
 def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    data_assigned = []
    assigned_cluster = 0
-    resetdist = dmlib.calcdiff2d([0,0],[highpointx, highpointy])
-    #print('resetdist =' + str(resetdist))
+    resetdist = dmlib.calcdiff2d([0, 0], [highpointx, highpointy])
+
    for item in range(0, len(xdata)):
        olddistance = resetdist
        for cluster in range(0, clusters):
            distance = dmlib.calcdiff2d(globals()["cpoint_" + str(cluster)], [xdata[item], ydata[item]])
-           # print('distance from point ' + str(item) + ' to cluster ' + str(cluster) + ': ' + str(distance))
+
            if distance < olddistance:
                olddistance = distance
                assigned_cluster = cluster
-       # print('cluster number ' + str(cluster) + ' assigned')
+
        data_assigned.append(assigned_cluster)
-    # Add the assigned values list to the new_data array
-    # new_data.append(data_assigned)
+
    return data_assigned

+
 # Startup function for collecting necesarry xdata
 def startup(xdata, ydata):
    # Using two clusters for testing
@@ -121,6 +123,7 @@ def startup(xdata, ydata):
    seconds = time.time() - start_time
    print(str(seconds) + " seconds for execution")

+
 # Start the algorithm and generate test xdata
 xdata = dmtest.numGenLight(10000, False, 5)
 ydata = dmtest.numGenLight(10000, False, 2)