kmeans Update 0.2

- Added documentation
- Added new array for data savings

Co-Authored-By: tchemn <tchemn@users.noreply.github.com>
This commit is contained in:
2018-05-26 21:25:57 +02:00
parent c6c73122b0
commit 5f9143b604

View File

@@ -3,7 +3,7 @@
#description: Our personal Python K-Means++ implementation #description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer #author: Tillmann Brendel, Conrad Großer
#date: 26.05.2018 #date: 26.05.2018
#version: 0.1 #version: 0.2
#usage: python pyscript.py #usage: python pyscript.py
#notes: #notes:
#known_issues: #known_issues:
@@ -25,29 +25,50 @@ import multiprocessing
# CODE # CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(data): def kmeansmk1(data):
# Using two clusters for testing
clusters = 2 clusters = 2
data_size = len(data)
# Create a new data array for working
new_data = []
new_data.append(data)
# Get the size of the data array
data_size = len(new_data[0])
# Defining cluster points # Defining cluster points
for i in range(0, clusters): for i in range(0, clusters):
globals()["cpoint_" + str(i)] = randint(0, data_size) globals()["cpoint_" + str(i)] = randint(0, data_size)
print("Cluster " + str(i) + ": " + str(data[globals()["cpoint_" + str(i)]])) print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]]))
# Create new array for assigned clusters of each value
data_assigned = [] data_assigned = []
highPoint = findHighest(data)
# Get max value in the data array
highPoint = findHighest(new_data[0])
# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
for item in range(0, data_size): for item in range(0, data_size):
# Set the minimal cluster difference to the highest difference in the list to ease comparison
min_cluster = highPoint min_cluster = highPoint
# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
for cluster in range(0, clusters): for cluster in range(0, clusters):
clusternumber = globals()["cpoint_" + str(cluster)] clusternumber = globals()["cpoint_" + str(cluster)]
if min_cluster > calcdiff(item, clusternumber, data): if min_cluster > calcdiff(item, clusternumber, new_data[0]):
min_cluster = calcdiff(item, clusternumber, data) min_cluster = calcdiff(item, clusternumber, new_data[0])
assinged_cluster = clusternumber assinged_cluster = clusternumber
# Assign the minimal difference cluster to the data
data_assigned.append(assinged_cluster) data_assigned.append(assinged_cluster)
# Add the assigned values list to the new_data array
new_data.append(data_assigned)
# for item in range(0, data_size): # Print out the list of datapoints and assigned clusters
# print("Datapoint: " + str(data[item]) + " | Assigned cluster: " + str(data[data_assigned[item]])) for item in range(0, len(new_data[0])):
print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]]))
return new_data
# Determine the highest int value in an array
def findHighest(data): def findHighest(data):
maximum = 0 maximum = 0
for i in range(0, len(data)): for i in range(0, len(data)):
@@ -55,6 +76,7 @@ def findHighest(data):
maximum = int(data[i]) maximum = int(data[i])
return maximum return maximum
# Calculate the difference between two points given the indexes of these data entries
def calcdiff(point1, point2, data): def calcdiff(point1, point2, data):
if int(data[point2]) > int(data[point1]): if int(data[point2]) > int(data[point1]):
difference = int(data[point2]) - int(data[point1]) difference = int(data[point2]) - int(data[point1])
@@ -63,10 +85,11 @@ def calcdiff(point1, point2, data):
# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) # print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
return betrag(difference) return betrag(difference)
def betrag(zahl): # Get the absolute value of a number
if zahl < 0: def betrag(number):
zahl = int((-2 * zahl) / 2) if number < 0:
return zahl number = int((-2 * number) / 2)
return number
# Startup function for collecting necessary data # Startup function for collecting necessary data
def startup(data): def startup(data):
@@ -79,7 +102,7 @@ def startup(data):
# Firing up the engines! # Firing up the engines!
kmeansmk1(data) kmeansmk1(data)
# kmeansmk1(clusters, cores, path) # kmeansmk1(clusters, cores, data)
# Stopping benchmark # Stopping benchmark
seconds = time.time() - start_time seconds = time.time() - start_time
@@ -111,5 +134,6 @@ def generatePLZ(start):
plz = plz + str(randint(0,9)) plz = plz + str(randint(0,9))
return plz return plz
# Start the algorithm and generate test data
data = testgenerator() data = testgenerator()
startup(data) startup(data)