Reworked Code

This commit is contained in:
2019-04-25 12:06:06 +02:00
parent 7d31468c43
commit 37daadb020
4 changed files with 178 additions and 172 deletions

View File

@@ -1,5 +1,7 @@
# Imports for dmlib
import math
# Calculate the difference between two points giving the indexes of these xdata entries
def calcdiff(point1, point2):
if int(point2) > int(point1):
@@ -8,23 +10,10 @@ def calcdiff(point1, point2):
difference = int(point1) - int(point2)
return difference
# Calculate the difference between two points in 2D space
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two points in 2D space.

    Args:
        point1: Sequence whose first two items are the x and y coordinate.
            Each coordinate must be convertible with ``int()`` (ints or
            numeric strings).
        point2: The second point, same layout as ``point1``.

    Returns:
        The distance between the two points as a non-negative float.
    """
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # math.hypot can never be negative, so no separate absolute-value pass
    # over the result is required.
    return math.hypot(x2 - x1, y2 - y1)
# Return the absolute value of a number; negative inputs come back as int
def betrag(number):
    """Return the absolute value of ``number``.

    Args:
        number: An int or float.

    Returns:
        For negative input, the magnitude truncated to ``int``. Input that is
        zero or positive is returned unchanged (so a float stays a float).
    """
    if number < 0:
        # Negate directly instead of computing (-2 * number) / 2: the float
        # division loses precision for large ints and can overflow.
        return int(-number)
    return number
# Determine the highest int value in an array and return it as an int
def findHighest(data):
    """Return the largest value in ``data`` after converting items to int.

    Args:
        data: Iterable of ints or numeric strings.

    Returns:
        The highest item as an int, or 0 when ``data`` is empty. Items are
        compared numerically, so "10" ranks above "7".
    """
    # Convert each item exactly once and let max() do the scan. The default
    # keeps the empty-input result at 0 without flooring negative data at 0.
    return max((int(item) for item in data), default=0)
return abs(difference)

View File

@@ -1,6 +1,7 @@
# For random generation of numbers import randint
from random import randint, shuffle
# Simple generator for test nums (40-40-20 biased), returns 1D array of nums
def numGenLight(entries, shuffle, num_lenght):
dataArray = []
@@ -18,6 +19,7 @@ def numGenLight(entries, shuffle, num_lenght):
shuffle(dataArray)
return dataArray
# Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber):
number = str(startingNumber)
@@ -25,7 +27,9 @@ def generateNumber(numberLenght, startingNumber):
number = number + str(randint(0, 9))
return number
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# Function for writing data into a file
# content = string, nameChunkStart and namePartStart are for better naming
# /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart):
filenumber = int(nameChunkStart) + int(namePartStart)
@@ -33,6 +37,7 @@ def writeFile(content, nameChunkStart, namePartStart):
for w in range(0, len(content)):
file.write(content[w] + "\n")
# Function for generating 'entries' numbers, each 'int_lenght' digits long, in 'cluster' clusters
def numGen(entries, cluster, int_lenght, suffle_value):
dataArray = []
@@ -47,7 +52,11 @@ def numGen(entries, cluster, int_lenght, suffle_value):
dataArray.append(generateNumber(int_lenght, randint(1, 9)))
else:
cluster_decider = randint(0, cluster - 1)
dataArray.append(generateNumber(int_lenght - 1, clusterArray[cluster_decider]))
dataArray.append(
generateNumber(
int_lenght - 1,
clusterArray[cluster_decider]
))
if suffle_value:
shuffle(dataArray)

View File

@@ -4,11 +4,11 @@
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
#version: 1.2
# version: 1.3
# usage: python pyscript.py
# notes:
#dependencies: mathplotlib
#known_issues: When clusters are 'thin' or noice is to strong --> unaccurate
# dependencies: matplotlib
# known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
# python_version: 3.x
# ==============================================================================
@@ -28,6 +28,7 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(data, clusters):
@@ -37,15 +38,15 @@ def kmeansmk1(data, clusters):
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
# Get max value in the data array
highPoint = dmlib.findHighest(data)
highPoint = max(data)
# Define variables for running the algorithm (runs is just for benchmarking!)
done = 0
done = False
runs = 0
# Until calcClusters reports done, rearrange the clusters and reassign the data to them
while done == 0:
runs = runs + 1
while not done:
runs += 1
new_data = assignCluster(data, highPoint, clusters)
done = calcClusters(new_data, clusters)
@@ -70,9 +71,10 @@ def kmeansmk1(data, clusters):
return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
changed = 0
changed = False
for cluster in range(0, clusters):
# Getting current cluster and saving it in temporary variable
prev_cluster = globals()["cpoint_" + str(cluster)]
@@ -88,10 +90,11 @@ def calcClusters(data, clusters):
# Checking if previous clusterpoint is equal to the one just calculated
if prev_cluster == globals()["cpoint_" + str(cluster)]:
changed = 1
changed = True
return changed
def assignCluster(data, highPoint, clusters):
# Create a new data array for working
new_data = []
@@ -117,6 +120,7 @@ def assignCluster(data, highPoint, clusters):
return new_data
# Startup function for collecting necessary data
def startup(data):
# Using two clusters for testing
@@ -134,6 +138,7 @@ def startup(data):
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test data
# NOTE(review): both assignments are present in this view; run in order, the
# second rebinds `data`, so only its result would reach startup().
# numGen's parameters are (entries, cluster, int_lenght, suffle_value).
# TODO confirm: startup() is commented as "Using two clusters for testing",
# which does not match generating data with 10 clusters.
data = dmtest.numGen(10000, 2, 5, True)
data = dmtest.numGen(10000, 10, 5, True)
# Pass the generated data to startup().
startup(data)

View File

@@ -4,7 +4,7 @@
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 04.06.2018
# version: 1.5
# version: 1.6
# usage: python pyscript.py
# notes:
# known_issues:
@@ -31,26 +31,25 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(xdata, ydata, clusters):
# Defining cluster points
for i in range(0, clusters):
globals()["cpoint_" + str(i)] = [xdata[randint(0, len(xdata))], ydata[randint(0, len(ydata))]]
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
#get max data in the data arrays
highpointx = dmlib.findHighest(xdata)
highpointy = dmlib.findHighest(ydata)
#print('highpoinx: ' + str(highpointx))
#print('highpointy: ' + str(highpointy))
# Get the maximum of the data
highpointx = max(xdata)
highpointy = max(ydata)
# Define variables for running the algorithm (runs is just as important as every other variable)
done = 0
done = False
runs = 0
# As long as calcClusters returns done it will rearrange the clusters and assign the data to the clusters
while done == 0:
runs = runs + 1
# As long as calcClusters returns False it will rearrange the clusters and assign the data to the clusters
while not done:
runs += 1
assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
# assigned_points consists of the clusternumbers
done = calcClusters(xdata, ydata, assigned_points, clusters)
@@ -59,51 +58,54 @@ def kmeansmk1(xdata, ydata, clusters):
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
for i in range(0, clusters):
plt.plot(globals()["cpoint_" + str(i)][0], globals()["cpoint_" + str(i)][1], 'ro')
plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
plt.show()
# Calculates the mean point of each cluster from the x/y data and the cluster number assigned to each point
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the mean of the points assigned to it.

    Centres are read from and written back to the module globals
    ``cpoint_<n>``. The centre each cluster had on entry is stored in the
    module global ``oldcpoint_<n>``.

    Args:
        xdata: x coordinates, one per point (ints or numeric strings).
        ydata: y coordinates, same length and order as ``xdata``.
        assigned_points: Cluster index of each point, parallel to ``xdata``.
        clusters: Number of clusters to process.

    Returns:
        True when no cluster centre changed during this call, False as soon
        as at least one centre moved.
    """
    # One flag for the whole call: resetting it per cluster would let the
    # last cluster alone decide the result.
    all_unchanged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        globals()["oldcpoint_" + str(cluster)] = previous

        sum_x = 0
        sum_y = 0
        count = 0
        for item, x_value in enumerate(xdata):
            if assigned_points[item] == cluster:
                sum_x += int(x_value)
                sum_y += int(ydata[item])
                count += 1

        if count == 0:
            # No point belongs to this cluster: keep its centre where it is
            # instead of dividing by zero.
            continue

        updated = [round(sum_x / count), round(sum_y / count)]
        globals()[key] = updated
        # Check whether the previous centre equals the one just calculated.
        if previous != updated:
            all_unchanged = False
    return all_unchanged
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every data point to its nearest cluster centre.

    Cluster centres are read from the module globals ``cpoint_0`` up to
    ``cpoint_<clusters - 1>``. Distances come from ``dmlib.calcdiff2d``.

    Args:
        xdata: x coordinates, one per point.
        ydata: y coordinates, same length and order as ``xdata``.
        clusters: Number of cluster centres to compare against.
        highpointx: No longer used; kept so existing callers keep working.
        highpointy: No longer used; kept so existing callers keep working.

    Returns:
        A list holding one cluster index per point, in input order. When
        several centres are equally close the lowest index wins.
    """
    # The centres do not change while points are being assigned, so look
    # them up once instead of once per point and cluster.
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for item, x_value in enumerate(xdata):
        point = [x_value, ydata[item]]
        # Start fresh for every point so a result from the previous point
        # can never leak through; an infinite starting distance guarantees
        # the first centre compared is always accepted.
        assigned_cluster = 0
        best_distance = float("inf")
        for cluster, centre in enumerate(centres):
            distance = dmlib.calcdiff2d(centre, point)
            if distance < best_distance:
                best_distance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)
    return data_assigned
# Startup function for collecting necessary xdata
def startup(xdata, ydata):
# Using two clusters for testing
@@ -121,6 +123,7 @@ def startup(xdata, ydata):
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test xdata
xdata = dmtest.numGenLight(10000, False, 5)
ydata = dmtest.numGenLight(10000, False, 2)