Reworked Code

This commit is contained in:
2019-04-25 12:06:06 +02:00
parent 7d31468c43
commit 37daadb020
4 changed files with 178 additions and 172 deletions

View File

@@ -1,5 +1,7 @@
# Imports for dmlib
import math import math
# Calculate the difference between two points giving the indexes of these xdata entries # Calculate the difference between two points giving the indexes of these xdata entries
def calcdiff(point1, point2): def calcdiff(point1, point2):
if int(point2) > int(point1): if int(point2) > int(point1):
@@ -8,23 +10,10 @@ def calcdiff(point1, point2):
difference = int(point1) - int(point2) difference = int(point1) - int(point2)
return difference return difference
# Calculate the difference between two points in 2D space # Calculate the difference between two points in 2D space
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two points in 2D space.

    Args:
        point1: Sequence whose first two entries are the x and y coordinate
            (ints or int-convertible strings).
        point2: Second point, same layout as ``point1``.

    Returns:
        The distance as a non-negative float.

    Raises:
        ValueError: If a coordinate cannot be converted with ``int()``.
    """
    delta_x = int(point2[0]) - int(point1[0])
    delta_y = int(point2[1]) - int(point1[1])
    # hypot is never negative, so no extra abs() is needed, and it avoids
    # the intermediate overflow of squaring very large coordinates.
    return math.hypot(delta_x, delta_y)
# Get the absolute value of a number and returns it as int
def betrag(number):
    """Return the absolute value of ``number``.

    Negative inputs are negated and truncated to ``int``; non-negative
    inputs are returned unchanged (including their type).

    Args:
        number: An int or float.

    Returns:
        ``int(-number)`` for negative input, otherwise ``number`` itself.
    """
    if number < 0:
        # Negate directly instead of (-2 * number) / 2: the float division
        # silently loses precision for integers above 2**53.
        return int(-number)
    return number
# Determine the highest int value in an array and returns is as an int
def findHighest(data):
    """Return the highest value in ``data`` as an int.

    Args:
        data: Iterable of ints or int-convertible strings.

    Returns:
        The largest entry converted with ``int()``; 0 when ``data`` is empty.

    Raises:
        ValueError: If an entry cannot be converted with ``int()``.
    """
    # Let max() pick the real maximum instead of seeding the search with 0,
    # which reported 0 for input consisting only of negative numbers.
    return max((int(value) for value in data), default=0)

View File

@@ -1,55 +1,64 @@
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint, shuffle from random import randint, shuffle
# Simple generator for test nums (40-20-30-10 biased: leading 2, 9, 4, random), returns 1D array of nums # Simple generator for test nums (40-20-30-10 biased: leading 2, 9, 4, random), returns 1D array of nums
def numGenLight(entries, shuffle, num_lenght):
    """Generate a biased 1D list of random numbers as strings.

    By position, the first 40 % of the entries start with 2, the next 20 %
    with 9, the next 30 % with 4 and the remaining 10 % with a random digit.

    Args:
        entries: Number of values to generate (converted with ``int()``).
        shuffle: If truthy, the result is shuffled in place before returning.
        num_lenght: Length argument passed on to ``generateNumber``.

    Returns:
        A list of ``entries`` strings produced by ``generateNumber``.
    """
    # The ``shuffle`` parameter hides the module-level ``random.shuffle``
    # import, so fetch the function under a different local name.
    from random import shuffle as shuffle_in_place

    entries = int(entries)
    # Band limits are loop-invariant; compute them once.
    end_of_twos = round(entries * 0.4)
    end_of_nines = round(entries * 0.6)
    end_of_fours = round(entries * 0.9)

    dataArray = []
    for i in range(entries):
        if i < end_of_twos:
            leading_digit = 2
        elif i < end_of_nines:
            leading_digit = 9
        elif i < end_of_fours:
            leading_digit = 4
        else:
            leading_digit = randint(0, 9)
        dataArray.append(generateNumber(num_lenght, leading_digit))

    if shuffle:
        shuffle_in_place(dataArray)
    return dataArray
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber):
    """Build a random number string that starts with ``startingNumber``.

    Args:
        numberLenght: ``numberLenght - 1`` random digits are appended, so a
            single-digit ``startingNumber`` yields ``numberLenght`` characters.
        startingNumber: Value whose ``str()`` form becomes the prefix.

    Returns:
        The prefix followed by the random digits, as a string. Only the
        prefix is returned when ``numberLenght`` is 1 or less.
    """
    parts = [str(startingNumber)]
    # Collect the digits and join once instead of growing a string in a loop.
    parts.extend(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return "".join(parts)
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# Function for writing data into a file
# content = string, nameChunkStart and namePartStart are for better naming
# /testdata/ folder has to be created at this point # /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart):
    """Write ``content`` line by line to ``testdata/file<N>.txt``.

    ``N`` is ``int(nameChunkStart) + int(namePartStart)``. The ``testdata``
    directory must already exist.

    Args:
        content: Sequence of strings; each one is written followed by a
            newline.
        nameChunkStart: First summand of the file number.
        namePartStart: Second summand of the file number.

    Raises:
        FileNotFoundError: If the ``testdata`` directory does not exist.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # The context manager closes (and flushes) the file even if a write fails.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:
        file.writelines(entry + "\n" for entry in content)
# Function for generating 'entries'x int_lenght'-long numbers in 'clusters' clusters # Function for generating 'entries'x int_lenght'-long numbers in 'clusters' clusters
def numGen(entries, cluster, int_lenght, suffle_value):
    """Generate ``entries`` number strings grouped around random prefixes.

    ``cluster`` two-digit prefixes (10-99) are drawn first. Each entry is
    then, with a chance of one in three, a number with a random leading digit
    1-9; otherwise it starts with one of the drawn prefixes.

    Args:
        entries: How many numbers to generate.
        cluster: How many cluster prefixes to draw.
        int_lenght: Length argument for ``generateNumber``; clustered entries
            use ``int_lenght - 1`` because their prefix has two digits.
        suffle_value: If truthy, shuffle the result before returning it.

    Returns:
        List of the generated number strings.
    """
    prefixes = [randint(10, 99) for _ in range(cluster)]

    numbers = []
    for _ in range(entries):
        if randint(0, 2) == 2:
            # Noise entry: not tied to any cluster prefix.
            leading_digit = randint(1, 9)
            numbers.append(generateNumber(int_lenght, leading_digit))
            continue
        prefix = prefixes[randint(0, cluster - 1)]
        numbers.append(generateNumber(int_lenght - 1, prefix))

    if suffle_value:
        shuffle(numbers)
    return numbers

View File

@@ -1,16 +1,16 @@
#!/usr/bin/env python #!/usr/bin/env python
#title: kmeansMkI.py # title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation # description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
#license: Pending # license: Pending
#date: 26.05.2018 # date: 26.05.2018
#version: 1.2 # version: 1.3
#usage: python pyscript.py # usage: python pyscript.py
#notes: # notes:
#dependencies: mathplotlib # dependencies: matplotlib
#known_issues: When clusters are 'thin' or noise is too strong --> unaccurate # known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
#python_version: 3.x # python_version: 3.x
#============================================================================== # ==============================================================================
# IMPORTS # IMPORTS
@@ -28,112 +28,117 @@ import matplotlib.pyplot as plt
import dmlib import dmlib
import dmtest import dmtest
# CODE # CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(data, clusters):
    """Run the 1D k-means loop on ``data`` and plot the result.

    Cluster centers are kept in module globals named ``cpoint_<i>``, which
    ``assignCluster`` and ``calcClusters`` read and update. The initial
    centers are drawn uniformly at random from ``data``.

    Args:
        data: Non-empty sequence of ints or int-convertible strings.
        clusters: Number of clusters to fit.

    Returns:
        Always 0.
    """
    # Defining cluster points. randint() includes both bounds, so the upper
    # bound has to be the last valid index.
    last_index = len(data) - 1
    for i in range(clusters):
        globals()["cpoint_" + str(i)] = data[randint(0, last_index)]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Largest value, compared numerically: max() on the raw strings would
    # compare lexicographically and return a str.
    highPoint = max(int(value) for value in data)

    # runs only counts the iterations for the final report
    done = False
    runs = 0

    # Reassign the data and recompute the centers until calcClusters reports done
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Artificial y axis (the item index) so the 1D data can be shown in 2D
    anew = list(range(len(data)))

    # Drawing found clusters as lines
    for i in range(clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()
    return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
    """Move every 1D cluster center to the rounded mean of its members.

    Centers live in module globals named ``cpoint_<i>``. An item belongs to
    cluster ``i`` when its assigned center equals the current ``cpoint_<i>``.

    Args:
        data: Two parallel lists ``[values, assigned_centers]``.
        clusters: Number of clusters.

    Returns:
        True when no center moved (the algorithm is done), False otherwise.
    """
    values, assigned_centers = data[0], data[1]
    converged = True
    for cluster in range(clusters):
        name = "cpoint_" + str(cluster)
        prev_cluster = globals()[name]

        members = [
            int(value)
            for value, center in zip(values, assigned_centers)
            if center == prev_cluster
        ]
        if not members:
            # An empty cluster has no mean; keep its center instead of
            # dividing by zero.
            continue

        globals()[name] = round(sum(members) / len(members))
        # Done only when every center stayed put, not as soon as one did.
        if globals()[name] != prev_cluster:
            converged = False
    return converged
def assignCluster(data, highPoint, clusters):
    """Assign every 1D value to its nearest cluster center.

    Centers are read from module globals named ``cpoint_<i>``. Ties go to the
    cluster with the lowest index.

    Args:
        data: Sequence of values accepted by ``dmlib.calcdiff``.
        highPoint: Unused; kept so existing callers keep working.
        clusters: Number of clusters (at least 1 when ``data`` is non-empty).

    Returns:
        ``[data, data_assigned]`` where ``data_assigned[i]`` is the center
        value (not the index) nearest to ``data[i]``.

    Raises:
        ValueError: If ``data`` is non-empty and ``clusters`` is 0.
    """
    # The centers do not change during assignment, so read them once.
    centers = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for value in data:
        distances = [dmlib.calcdiff(value, center) for center in centers]
        # Take the true minimum rather than comparing against a sentinel that
        # a distance might never beat, which left the assignment stale.
        nearest = distances.index(min(distances))
        data_assigned.append(centers[nearest])

    return [data, data_assigned]
# Startup function for collecting necessary data # Startup function for collecting necessary data
def startup(data):
    """Ask for the cluster count, run ``kmeansmk1`` and print the runtime.

    Args:
        data: The values handed on to ``kmeansmk1``.
    """
    # Core count and data path are not prompted for yet; the data comes
    # from the caller.
    clusters = int(input("How many clusters are known? "))

    # Benchmark: wall-clock time around the whole run
    start_time = time.time()
    kmeansmk1(data, clusters)
    seconds = time.time() - start_time

    print("{} seconds for execution".format(seconds))
# Start the algorithm and generate test data # Start the algorithm and generate test data
# 10000 shuffled five-digit test numbers spread over 10 cluster prefixes
data = dmtest.numGen(entries=10000, cluster=10, int_lenght=5, suffle_value=True)
startup(data)

View File

@@ -1,14 +1,14 @@
#!/usr/bin/env python #!/usr/bin/env python
# title: kmeansMkI.py # title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation # description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
# license: Pending # license: Pending
# date: 04.06.2018 # date: 04.06.2018
# version: 1.5 # version: 1.6
# usage: python pyscript.py # usage: python pyscript.py
# notes: # notes:
# known_issues: # known_issues:
# python_version: 3.x # python_version: 3.x
# ============================================================================== # ==============================================================================
# IMPORTS # IMPORTS
@@ -31,79 +31,81 @@ import matplotlib.pyplot as plt
import dmlib import dmlib
import dmtest import dmtest
# CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means loop on the given points and plot the result.

    Cluster centers are kept in module globals named ``cpoint_<i>`` as
    ``[x, y]`` lists, which ``assignCluster`` and ``calcClusters`` read and
    update. Each initial center takes a random entry of ``xdata`` and an
    independently drawn random entry of ``ydata``.

    Args:
        xdata: Non-empty sequence of x coordinates (ints or int-convertible
            strings).
        ydata: Non-empty sequence of y coordinates, parallel to ``xdata``.
        clusters: Number of clusters to fit.
    """
    # Defining cluster points. randint() includes both bounds, so the upper
    # bound has to be the last valid index.
    for i in range(clusters):
        globals()["cpoint_" + str(i)] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Maximum per axis, compared numerically: max() on the raw strings would
    # compare lexicographically.
    highpointx = max(int(x) for x in xdata)
    highpointy = max(int(y) for y in ydata)

    done = False
    runs = 0

    # Reassign the points and recompute the centers until calcClusters
    # reports that nothing changed
    while not done:
        runs += 1
        # assigned_points holds one cluster number per point
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Centers as red dots on top of the black data crosses
    for i in range(clusters):
        plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')

    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every 2D cluster center to the rounded mean of its members.

    Centers live in module globals named ``cpoint_<i>`` as ``[x, y]`` lists.
    The previous center of each cluster is stored in ``oldcpoint_<i>``.

    Args:
        xdata: Sequence of x coordinates (ints or int-convertible strings).
        ydata: Sequence of y coordinates, parallel to ``xdata``.
        assigned_points: Cluster number of each point, parallel to ``xdata``.
        clusters: Number of clusters.

    Returns:
        True when no center moved (the algorithm is done), False otherwise.
    """
    # Initialised once, outside the loop: resetting it per cluster made the
    # result depend on the last cluster only.
    cpointunchanged = True
    for cluster in range(clusters):
        name = "cpoint_" + str(cluster)
        old_name = "oldcpoint_" + str(cluster)
        globals()[old_name] = globals()[name]

        clustersumx = 0
        clustersumy = 0
        count = 0
        for item in range(len(xdata)):
            if assigned_points[item] == cluster:
                clustersumx += int(xdata[item])
                clustersumy += int(ydata[item])
                count += 1

        if count == 0:
            # An empty cluster has no mean; keep its center instead of
            # dividing by zero.
            continue

        globals()[name] = [round(clustersumx / count), round(clustersumy / count)]

        # Checking if the old cluster point equals the one just calculated
        if globals()[old_name] != globals()[name]:
            cpointunchanged = False
    return cpointunchanged
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every 2D point to the number of its nearest cluster center.

    Centers are read from module globals named ``cpoint_<i>``. Ties go to the
    cluster with the lowest index.

    Args:
        xdata: Sequence of x coordinates.
        ydata: Sequence of y coordinates, parallel to ``xdata``.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        List with one cluster number per point (0 when ``clusters`` is 0).
    """
    # The centers do not change during assignment, so read them once.
    centers = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for item in range(len(xdata)):
        point = [xdata[item], ydata[item]]
        # Reset per point and start from infinity: a finite sentinel could be
        # matched by a real distance, leaving the previous point's cluster.
        assigned_cluster = 0
        best_distance = float("inf")
        for cluster, center in enumerate(centers):
            distance = dmlib.calcdiff2d(center, point)
            if distance < best_distance:
                best_distance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)
    return data_assigned
# Startup function for collecting necesarry xdata # Startup function for collecting necesarry xdata
def startup(xdata, ydata): def startup(xdata, ydata):
# Using two clusters for testing # Using two clusters for testing
@@ -121,6 +123,7 @@ def startup(xdata, ydata):
seconds = time.time() - start_time seconds = time.time() - start_time
print(str(seconds) + " seconds for execution") print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test xdata # Start the algorithm and generate test xdata
xdata = dmtest.numGenLight(10000, False, 5) xdata = dmtest.numGenLight(10000, False, 5)
ydata = dmtest.numGenLight(10000, False, 2) ydata = dmtest.numGenLight(10000, False, 2)