Reworked Code

This commit is contained in:
2019-04-25 12:06:06 +02:00
parent 7d31468c43
commit 37daadb020
4 changed files with 178 additions and 172 deletions

View File

@@ -1,5 +1,7 @@
# Imports for dmlib
import math
# Calculate the difference between two points giving the indexes of these xdata entries
def calcdiff(point1, point2):
if int(point2) > int(point1):
@@ -8,23 +10,10 @@ def calcdiff(point1, point2):
difference = int(point1) - int(point2)
return difference
# Calculate the difference between two points in 2D space
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two points in 2D space.

    Args:
        point1: Sequence whose first two entries are the x and y coordinate;
            each entry is converted with int().
        point2: Second point, same layout as point1.

    Returns:
        The distance as a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # math.sqrt never returns a negative value, so no extra absolute-value
    # step (and no dependency on a separate helper) is needed here.
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
# Get the absolute value of a number and returns it as int
def betrag(number):
    """Return the absolute value of number.

    Negative inputs are negated and converted with int(), so a negative
    float is truncated toward zero. Non-negative inputs are returned
    unchanged.
    """
    if number < 0:
        # Plain negation keeps exact integer arithmetic; the former
        # (-2 * number) / 2 went through float division and lost precision
        # for large integers.
        return int(-number)
    return number
# Determine the highest int value in an array and return it as an int
def findHighest(data):
    """Return the largest entry of data as an int.

    Every entry is converted with int() before comparing, so digit strings
    are compared numerically. An empty input yields 0.
    """
    # Taking the true maximum (instead of starting the search at 0) also
    # gives the right answer when every entry is negative.
    return max((int(entry) for entry in data), default=0)
return abs(difference)

View File

@@ -1,55 +1,64 @@
# For random generation of numbers import randint
from random import randint, shuffle
# Simple generator for test nums (40% start with 2, 20% with 9, 30% with 4, 10% with a random digit), returns 1D array of nums
def numGenLight(entries, shuffle, num_lenght):
    """Generate a biased list of random digit strings for testing.

    By position, the first 40% of the entries start with 2, the next 20%
    with 9, the next 30% with 4 and the remainder with a random digit.

    Args:
        entries: Number of entries to generate.
        shuffle: If truthy, the result is shuffled in place before returning.
        num_lenght: Length argument passed to generateNumber for each entry.

    Returns:
        A list with one generateNumber() result per entry.
    """
    # The ``shuffle`` flag shadows random.shuffle inside this function, so
    # the real function is imported here under a different local name.
    from random import shuffle as shuffle_in_place

    # Range boundaries do not change during the loop, so compute them once.
    first_cut = round(entries * 0.4)
    second_cut = round(entries * 0.6)
    third_cut = round(entries * 0.9)

    dataArray = []
    for i in range(int(entries)):
        if i < first_cut:
            leading_digit = 2
        elif i < second_cut:
            leading_digit = 9
        elif i < third_cut:
            leading_digit = 4
        else:
            leading_digit = randint(0, 9)
        dataArray.append(generateNumber(num_lenght, leading_digit))

    if shuffle:
        shuffle_in_place(dataArray)
    return dataArray
# Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber):
    """Return startingNumber followed by random digits, as a string.

    Args:
        numberLenght: numberLenght - 1 random digits are appended, so the
            result has numberLenght characters when startingNumber is a
            single digit.
        startingNumber: Value whose str() form becomes the prefix.

    Returns:
        The generated digit string.
    """
    random_tail = "".join(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return str(startingNumber) + random_tail
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# Function for writing data into a file
# content = string, nameChunkStart and namePartStart are for better naming
# /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart):
    """Write every entry of content on its own line to a numbered text file.

    The file is testdata/file<N>.txt with N = int(nameChunkStart) +
    int(namePartStart). The testdata folder must already exist.

    Args:
        content: Sequence of strings, one per output line.
        nameChunkStart: First part of the file number.
        namePartStart: Second part of the file number.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # The context manager closes the file even if writing fails.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:
        file.writelines(line + "\n" for line in content)
# Function for generating 'entries' numbers, each 'int_lenght' digits long, grouped into 'cluster' clusters
def numGen(entries, cluster, int_lenght, suffle_value):
    """Generate clustered random digit strings for testing.

    A random two-digit prefix (10-99) is drawn for every cluster. Each entry
    is then either noise (random leading digit 1-9, drawn when the three-way
    decider comes up 2) or a number that starts with one of the cluster
    prefixes.

    Args:
        entries: Number of entries to generate.
        cluster: Number of cluster prefixes; with 0 or fewer, every entry is
            generated as noise.
        int_lenght: Length of each generated number in digits.
        suffle_value: If truthy, the result is shuffled before returning.

    Returns:
        A list of digit strings.
    """
    clusterArray = [randint(10, 99) for _ in range(cluster)]

    dataArray = []
    for _ in range(entries):
        decider = randint(0, 2)
        if decider == 2 or not clusterArray:
            dataArray.append(generateNumber(int_lenght, randint(1, 9)))
        else:
            prefix = clusterArray[randint(0, cluster - 1)]
            # The prefix already has two digits, so one fewer digit is requested.
            dataArray.append(generateNumber(int_lenght - 1, prefix))

    if suffle_value:
        shuffle(dataArray)
    return dataArray

View File

@@ -1,16 +1,16 @@
#!/usr/bin/env python
#title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer
#license: Pending
#date: 26.05.2018
#version: 1.2
#usage: python pyscript.py
#notes:
#dependencies: mathplotlib
#known_issues: When clusters are 'thin' or noice is to strong --> unaccurate
#python_version: 3.x
#==============================================================================
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.3
# usage: python pyscript.py
# notes:
# dependencies: matplotlib
# known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
# python_version: 3.x
# ==============================================================================
# IMPORTS
@@ -28,112 +28,117 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(data, clusters):
    """Cluster one-dimensional data and plot the result.

    Cluster centres are kept in module globals named cpoint_<index>, which
    assignCluster and calcClusters read and update.

    Args:
        data: Sequence of values convertible with int().
        clusters: Number of clusters to look for.

    Returns:
        Always 0.
    """
    # Defining cluster points
    for i in range(clusters):
        # randint includes both bounds, so the upper bound has to be the
        # last valid index rather than len(data).
        globals()["cpoint_" + str(i)] = data[randint(0, len(data) - 1)]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get max value in the data array. Entries may be digit strings, so they
    # are compared numerically and the result is an int.
    highPoint = max(int(value) for value in data)

    # Define variables for running the algorithm (runs is just for benchmarking!)
    done = False
    runs = 0

    # Until calcClusters reports done, assign the data to the clusters and
    # recompute the cluster centres
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Artificial y values (the item index) for showing 1D data in a 2D graphic
    anew = list(range(len(data)))

    # Drawing found clusters as lines
    for i in range(clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
    """Move every cluster centre to the rounded mean of its assigned items.

    Centres live in module globals named cpoint_<index>.

    Args:
        data: Two parallel sequences; data[0] holds the items and data[1]
            holds, per item, the centre value it was assigned to.
        clusters: Number of clusters.

    Returns:
        True when no centre changed in this pass, otherwise False.
    """
    values, labels = data[0], data[1]
    converged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        prev_cluster = globals()[key]

        members = [
            int(value)
            for value, label in zip(values, labels)
            if label == prev_cluster
        ]
        if not members:
            # Nothing assigned: keep the centre instead of dividing by zero.
            continue

        globals()[key] = round(sum(members) / len(members))

        # A single moved centre is enough to require another pass.
        if globals()[key] != prev_cluster:
            converged = False

    return converged
def assignCluster(data, highPoint, clusters):
    """Assign every item to the cluster centre with the smallest difference.

    Centres are read from module globals named cpoint_<index>.

    Args:
        data: Sequence of items.
        highPoint: Unused; kept so existing callers keep working.
        clusters: Number of clusters.

    Returns:
        A two-element list: the original data, and a parallel list holding
        the centre value assigned to each item.
    """
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for value in data:
        # min() always yields a centre for the current item (the first one
        # on ties), so no assignment can leak over from the previous item.
        nearest = min(centres, key=lambda centre: dmlib.calcdiff(value, centre))
        data_assigned.append(nearest)

    return [data, data_assigned]
# Startup function for collecting necessary data
def startup(data):
    """Ask for the cluster count, run kmeansmk1 on data and print the runtime."""
    cluster_count = int(input("How many clusters are known? "))

    # The benchmark timer starts only after the prompt has been answered.
    began = time.time()
    kmeansmk1(data, cluster_count)
    elapsed = time.time() - began

    print(str(elapsed) + " seconds for execution")
# Generate the test data, then start the algorithm.
# NOTE(review): the first assignment is overwritten by the second before it
# is used, so only the second call's result reaches startup().
data = dmtest.numGen(10000, 2, 5, True)
data = dmtest.numGen(10000, 10, 5, True)
startup(data)

View File

@@ -1,14 +1,14 @@
#!/usr/bin/env python
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.5
# usage: python pyscript.py
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.6
# usage: python pyscript.py
# notes:
# known_issues:
# python_version: 3.x
# python_version: 3.x
# ==============================================================================
# IMPORTS
@@ -31,79 +31,81 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
    """Cluster two-dimensional data and plot the result.

    Cluster centres are kept in module globals named cpoint_<index> as
    [x, y] lists, which assignCluster and calcClusters read and update.

    Args:
        xdata: Sequence of x values convertible with int().
        ydata: Sequence of y values convertible with int().
        clusters: Number of clusters to look for.
    """
    # Defining cluster points
    for i in range(clusters):
        # randint includes both bounds, so the upper bound has to be the
        # last valid index rather than the length.
        globals()["cpoint_" + str(i)] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get the maximum of the data. Entries may be digit strings, so they are
    # compared numerically instead of lexicographically.
    highpointx = max(int(x) for x in xdata)
    highpointy = max(int(y) for y in ydata)

    done = False
    runs = 0

    # Until calcClusters reports done, assign the data to the clusters and
    # recompute the cluster centres
    while not done:
        runs += 1
        # assigned_points consists of the cluster numbers
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    for i in range(clusters):
        centre = globals()["cpoint_" + str(i)]
        # int() keeps the plot numeric even if a centre still holds the raw
        # data entries it was initialised with.
        plt.plot(int(centre[0]), int(centre[1]), 'ro')

    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    Centres live in module globals named cpoint_<index> as [x, y] lists; the
    value each centre had before this pass is stored as oldcpoint_<index>.

    Args:
        xdata: Sequence of x values convertible with int().
        ydata: Sequence of y values convertible with int().
        assigned_points: Per point, the index of the cluster it belongs to.
        clusters: Number of clusters.

    Returns:
        True when no centre changed in this pass, otherwise False.
    """
    # Initialised once, before the loop: resetting it per cluster would let
    # the last cluster alone decide the result.
    cpointunchanged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        globals()["oldcpoint_" + str(cluster)] = previous

        members = [
            (int(x), int(y))
            for x, y, owner in zip(xdata, ydata, assigned_points)
            if owner == cluster
        ]
        if not members:
            # Nothing assigned: keep the centre instead of dividing by zero.
            continue

        count = len(members)
        globals()[key] = [
            round(sum(x for x, _ in members) / count),
            round(sum(y for _, y in members) / count),
        ]

        # Checking if the old cluster point is equal to the one just calculated
        if globals()[key] != previous:
            cpointunchanged = False

    return cpointunchanged
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every point to the index of its nearest cluster centre.

    Centres are read from module globals named cpoint_<index>.

    Args:
        xdata: Sequence of x values.
        ydata: Sequence of y values.
        clusters: Number of clusters.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        A list with one cluster index per point. With no clusters, every
        point gets index 0.
    """
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for x, y in zip(xdata, ydata):
        # min() always picks a cluster for the current point (the first one
        # on ties), so no index can leak over from the previous point.
        nearest = min(
            range(clusters),
            key=lambda cluster: dmlib.calcdiff2d(centres[cluster], [x, y]),
            default=0,
        )
        data_assigned.append(nearest)

    return data_assigned
# Startup function for collecting necesarry xdata
def startup(xdata, ydata):
# Using two clusters for testing
@@ -121,6 +123,7 @@ def startup(xdata, ydata):
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Generate the test xdata and ydata (10000 entries each).
# NOTE(review): presumably the arguments are (entries, shuffle, num_lenght),
# i.e. unshuffled numbers of length 5 for x and length 2 for y - confirm
# against dmtest.numGenLight.
xdata = dmtest.numGenLight(10000, False, 5)
ydata = dmtest.numGenLight(10000, False, 2)