K-Means Step 1

- Added simple cluster assigning

Co-Authored-By: tchemn <tchemn@users.noreply.github.com>
This commit is contained in:
2018-05-26 20:55:35 +02:00
parent 318b66b1a6
commit c6c73122b0

View File

@@ -24,12 +24,53 @@ import multiprocessing
# CODE # CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(data, clusters=2):
    """Assign every data point to the nearest of ``clusters`` random centres.

    This is the first K-Means step only: the centres are picked at random
    from the data and each point is attached to the closest one. The centres
    are not moved afterwards.

    Args:
        data: Sequence of values that ``int()`` accepts (numbers or numeric
            strings).
        clusters: Number of cluster centres to pick. Defaults to 2, the value
            that used to be hard-coded.

    Returns:
        A list with one entry per data point: the index (into ``data``) of
        the centre that point was assigned to. Empty for empty ``data``.

    Raises:
        ValueError: If ``clusters`` is smaller than 1, or an entry of
            ``data`` cannot be converted with ``int()``.
    """
    # Local import: randrange excludes its upper bound, unlike randint,
    # so the chosen index is always valid for ``data``.
    from random import randrange

    if clusters < 1:
        raise ValueError("clusters must be at least 1")

    data_size = len(data)
    if data_size == 0:
        return []

    # Defining cluster points (kept in a local list, not in module globals)
    cpoints = []
    for i in range(clusters):
        cpoints.append(randrange(data_size))
        print("Cluster " + str(i) + ": " + str(data[cpoints[i]]))

    # Convert once instead of on every distance computation.
    values = [int(entry) for entry in data]

    data_assigned = []
    for value in values:
        # min() keeps the first centre on ties and always yields a centre,
        # so no artificial upper bound for the distance is needed.
        nearest = min(cpoints, key=lambda cpoint: abs(value - values[cpoint]))
        data_assigned.append(nearest)
    return data_assigned
def findHighest(data):
    """Return the largest entry of ``data``, compared as integers.

    Args:
        data: Iterable of values that ``int()`` accepts.

    Returns:
        The highest value as an ``int``. Negative maxima are reported
        correctly; an empty ``data`` yields 0.
    """
    return max((int(entry) for entry in data), default=0)
def calcdiff(point1, point2, data):
    """Return the absolute difference between two entries of ``data``.

    Args:
        point1: Index of the first entry in ``data``.
        point2: Index of the second entry in ``data``.
        data: Indexable collection of values that ``int()`` accepts.

    Returns:
        The non-negative integer distance between the two entries.
    """
    return abs(int(data[point1]) - int(data[point2]))
def betrag(zahl):
    """Return the absolute value ("Betrag") of ``zahl``.

    Uses the built-in ``abs`` so large integers stay exact and the result
    keeps the numeric type of the argument.
    """
    return abs(zahl)
# Startup function for collecting necessary data # Startup function for collecting necessary data
def startup(data): def startup(data):
clusters = int(input("How many clusters are known? ")) # clusters = int(input("How many clusters are known? "))
# cores = input("How many cores should be used? ") # cores = input("How many cores should be used? ")
# path = input("Where is the data? ") or in this case data # path = input("Where is the data? ") or in this case data
@@ -37,17 +78,17 @@ def startup(data):
start_time = time.time() start_time = time.time()
# Firing up the engines! # Firing up the engines!
kmeansmk1(clusters, data) kmeansmk1(data)
# kmeansmk1(clusters, cores, path) # kmeansmk1(clusters, cores, path)
# Stopping benchmark # Stopping benchmark
seconds = time.time() - start_time seconds = time.time() - start_time
print(str(seconds) + " seconds for execution") # print(str(seconds) + " seconds for execution")
# Simple generator for test data # Simple generator for test data
def testgenerator(): def testgenerator():
dataArray = [] dataArray = []
for i in range(1,100): for i in range(0,100):
if i <= 20: if i <= 20:
plz = generatePLZ("09") plz = generatePLZ("09")
elif i > 20 and i < 50: elif i > 20 and i < 50: