Reworked Code
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
# Imports for dmlib
|
||||
import math
|
||||
|
||||
|
||||
# Calculate the difference between two points giving the indexes of these xdata entries
|
||||
def calcdiff(point1, point2):
|
||||
if int(point2) > int(point1):
|
||||
@@ -8,23 +10,10 @@ def calcdiff(point1, point2):
|
||||
difference = int(point1) - int(point2)
|
||||
return difference
|
||||
|
||||
|
||||
# Calculate the difference between two points in 2D space
|
||||
def calcdiff2d(point1, point2):
    """Return the Euclidean distance between two points in 2D space.

    Args:
        point1: Sequence whose first two entries are the int-convertible
            x and y coordinates of the first point.
        point2: Sequence whose first two entries are the int-convertible
            x and y coordinates of the second point.

    Returns:
        The distance between the two points as a non-negative float.
    """
    # Coordinates may be given as strings, hence the int() conversion.
    x1, y1 = int(point1[0]), int(point1[1])
    x2, y2 = int(point2[0]), int(point2[1])
    # A square root is never negative, so no absolute-value helper is needed.
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
|
||||
|
||||
# Get the absolute value of a number and returns it as int
|
||||
def betrag(number):
    """Return the absolute value of ``number``.

    Args:
        number: Any value that can be compared with 0 and negated.

    Returns:
        ``number`` unchanged when it is zero or positive; otherwise the
        negated value truncated to an ``int``.
    """
    if number < 0:
        # Negate directly instead of computing (-2 * number) / 2: the float
        # division silently loses precision for integers beyond 2**53.
        return int(-number)
    return number
|
||||
|
||||
# Determine the highest int value in an array and return it as an int
|
||||
def findHighest(data):
    """Return the largest entry of ``data`` as an int.

    Args:
        data: Iterable of int-convertible values (ints or digit strings).

    Returns:
        The numerically largest value, or 0 when ``data`` is empty.
    """
    # Compare the converted values themselves rather than seeding the search
    # with 0, so that all-negative input yields its real maximum.
    return max((int(value) for value in data), default=0)
|
||||
return abs(difference)
|
||||
|
||||
@@ -1,55 +1,64 @@
|
||||
# For random generation of numbers import randint
|
||||
from random import randint, shuffle
|
||||
|
||||
|
||||
# Simple generator for test nums (40-40-20 biased), returns 1D array of nums
|
||||
def numGenLight(entries, shuffle, num_lenght):
    """Generate a biased 1D list of random digit strings for testing.

    The leading digit depends on the position in the list: the first 40 % of
    the entries start with 2, the next 20 % with 9, the next 30 % with 4 and
    the remaining 10 % with a random digit.

    Args:
        entries: Number of strings to generate (converted with ``int``).
        shuffle: When truthy, the result is shuffled in place before it is
            returned.
        num_lenght: Length argument handed to ``generateNumber`` for every
            entry.

    Returns:
        A list with ``int(entries)`` generated strings.
    """
    # The ``shuffle`` parameter hides the module-level ``shuffle`` function
    # inside this body, so the module is imported locally and the function is
    # reached through it instead of calling the boolean flag.
    import random

    total = int(entries)
    # Band limits are loop-invariant, so compute them once.
    first_limit = round(total * 0.4)
    second_limit = round(total * 0.6)
    third_limit = round(total * 0.9)

    dataArray = []
    for i in range(total):
        if i < first_limit:
            leading_digit = 2
        elif i < second_limit:
            leading_digit = 9
        elif i < third_limit:
            leading_digit = 4
        else:
            leading_digit = randint(0, 9)
        dataArray.append(generateNumber(num_lenght, leading_digit))

    if shuffle:
        random.shuffle(dataArray)
    return dataArray
|
||||
|
||||
|
||||
# Function for generating the content of one single row randomly
|
||||
def generateNumber(numberLenght, startingNumber):
    """Build a random digit string that begins with ``startingNumber``.

    Args:
        numberLenght: ``numberLenght - 1`` random digits are appended to the
            start value; nothing is appended when this is 1 or less.
        startingNumber: Value whose ``str()`` form becomes the prefix.

    Returns:
        The prefix followed by the random digits, as a string.
    """
    # Join once instead of growing the string one character per iteration.
    random_digits = "".join(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return str(startingNumber) + random_digits
|
||||
|
||||
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
|
||||
|
||||
# Function for writing data into a file
|
||||
# content = string, nameChunkStart and namePartStart are for better naming
|
||||
# /testdata/ folder has to be created at this point
|
||||
def writeFile(content, nameChunkStart, namePartStart):
    """Write every entry of ``content`` on its own line into a test-data file.

    The file is ``testdata/file<N>.txt`` where ``N`` is the sum of the two
    name arguments. The ``testdata`` directory must already exist.

    Args:
        content: Sequence of strings; each is written followed by a newline.
        nameChunkStart: Int-convertible first summand of the file number.
        namePartStart: Int-convertible second summand of the file number.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # Use a context manager so the handle is closed (and flushed) even when a
    # write fails.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as handle:
        handle.writelines(entry + "\n" for entry in content)
|
||||
|
||||
|
||||
# Function for generating 'entries' numbers, each 'int_lenght' digits long, grouped in 'cluster' clusters
|
||||
def numGen(entries, cluster, int_lenght, suffle_value):
    """Generate clustered random digit strings for testing.

    Every cluster gets a random two-digit prefix. For each entry a three-way
    draw decides between noise (one case) and a clustered value (two cases).

    Args:
        entries: Number of strings to generate.
        cluster: Number of cluster prefixes to draw.
        int_lenght: Length argument for noise entries; clustered entries are
            generated with ``int_lenght - 1`` and a two-digit prefix.
        suffle_value: When truthy, the result is shuffled before returning.

    Returns:
        A list with ``entries`` generated strings.
    """
    # One random two-digit prefix per cluster
    prefixes = [randint(10, 99) for _ in range(cluster)]

    numbers = []
    for _ in range(entries):
        if randint(0, 2) == 2:
            # Noise: any non-zero leading digit, full length
            start, length = randint(1, 9), int_lenght
        else:
            # Clustered value: reuse the prefix of a randomly picked cluster
            start, length = prefixes[randint(0, cluster - 1)], int_lenght - 1
        numbers.append(generateNumber(length, start))

    if suffle_value:
        shuffle(numbers)

    return numbers
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
#!/usr/bin/env python
|
||||
#title: kmeansMkI.py
|
||||
#description: Our personal Python K-Means++ implementation
|
||||
#author: Tillmann Brendel, Conrad Großer
|
||||
#license: Pending
|
||||
#date: 26.05.2018
|
||||
#version: 1.2
|
||||
#usage: python pyscript.py
|
||||
#notes:
|
||||
#dependencies: mathplotlib
|
||||
#known_issues: When clusters are 'thin' or noice is to strong --> unaccurate
|
||||
#python_version: 3.x
|
||||
#==============================================================================
|
||||
# title: kmeansMkI.py
|
||||
# description: Our personal Python K-Means++ implementation
|
||||
# author: Tillmann Brendel, Conrad Großer
|
||||
# license: Pending
|
||||
# date: 26.05.2018
|
||||
# version: 1.3
|
||||
# usage: python pyscript.py
|
||||
# notes:
|
||||
# dependencies: matplotlib
|
||||
# known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
|
||||
# python_version: 3.x
|
||||
# ==============================================================================
|
||||
|
||||
# IMPORTS
|
||||
|
||||
@@ -28,112 +28,117 @@ import matplotlib.pyplot as plt
|
||||
import dmlib
|
||||
import dmtest
|
||||
|
||||
|
||||
# CODE
|
||||
# Main function of the algorithm
|
||||
def kmeansmk1(data, clusters):
    """Run the 1D k-means loop on ``data`` and plot the resulting centres.

    The centres live in the module globals ``cpoint_0`` ..
    ``cpoint_<clusters - 1>``, which ``assignCluster`` and ``calcClusters``
    read and update.

    Args:
        data: Non-empty sequence of int-convertible values (e.g. digit
            strings).
        clusters: Number of cluster centres to fit.

    Returns:
        Always 0.
    """
    # Defining cluster points: every centre starts on a random data item
    for i in range(clusters):
        # randint() includes its upper bound, so the last valid index is
        # len(data) - 1.
        globals()["cpoint_" + str(i)] = data[randint(0, len(data) - 1)]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get max value in the data array. Compare numerically: max() on digit
    # strings is lexicographic and would hand a str to assignCluster, which
    # compares this bound against int differences.
    highPoint = max(int(value) for value in data)

    # Define variables for running the algorithm (runs is just for benchmarking!)
    done = False
    runs = 0

    # Until calcClusters reports done, reassign the data and move the centres
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Artificial y axis (the item index) to draw the 1D data in a 2D graph
    anew = list(range(len(data)))

    # Drawing found clusters as lines
    for i in range(clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0
|
||||
|
||||
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
||||
def calcClusters(data, clusters):
    """Move every cluster centre to the rounded mean of its assigned items.

    The centres are read from and written to the module globals
    ``cpoint_0`` .. ``cpoint_<clusters - 1>``.

    Args:
        data: Two parallel sequences: ``data[0]`` holds the int-convertible
            items, ``data[1]`` holds, per item, the centre value it was
            assigned to.
        clusters: Number of cluster centres.

    Returns:
        True when no centre moved in this pass (the caller can stop
        iterating), False when at least one centre changed.
    """
    converged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        # Remember the current centre to detect movement afterwards
        prev_cluster = globals()[key]

        # Items are matched to a cluster through the centre value stored
        # next to them.
        members = [
            int(value)
            for value, owner in zip(data[0], data[1])
            if owner == prev_cluster
        ]

        # A cluster without items keeps its centre instead of dividing by zero
        if members:
            globals()[key] = round(sum(members) / len(members))

        # One moved centre is enough to require another pass
        if globals()[key] != prev_cluster:
            converged = False

    return converged
|
||||
|
||||
def assignCluster(data, highPoint, clusters):
    """Assign every item of ``data`` to its nearest cluster centre.

    The centres are read from the module globals ``cpoint_0`` ..
    ``cpoint_<clusters - 1>``; distances come from ``dmlib.calcdiff``.

    Args:
        data: Sequence of items accepted by ``dmlib.calcdiff``.
        highPoint: Unused; kept so existing callers keep working. The search
            no longer needs an upper bound to start from.
        clusters: Number of cluster centres.

    Returns:
        A two-element list ``[data, data_assigned]`` where ``data_assigned``
        holds, per item, the value of the centre it was assigned to (``None``
        when ``clusters`` is 0).
    """
    # The centres do not change during assignment, so look them up once
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    # Create new array for assigned clusters of each value
    data_assigned = []
    for value in data:
        # Start without a candidate so the first centre always wins; a fixed
        # upper bound could leave an item without (or with a stale) centre.
        min_cluster = None
        assigned_cluster = None
        for centre in centres:
            difference = dmlib.calcdiff(value, centre)
            # Strict comparison: on a tie the earlier centre is kept
            if min_cluster is None or difference < min_cluster:
                min_cluster = difference
                assigned_cluster = centre
        # Assign the minimal difference cluster to the data
        data_assigned.append(assigned_cluster)

    return [data, data_assigned]
|
||||
|
||||
# Startup function for collecting necessary data
|
||||
def startup(data):
    """Ask for the cluster count, run the algorithm and print the runtime.

    Args:
        data: Data set handed unchanged to ``kmeansmk1``.
    """
    # The number of clusters is read interactively
    cluster_count = int(input("How many clusters are known? "))

    # The benchmark timer starts only after the prompt has been answered
    began = time.time()

    # Firing up the engines!
    kmeansmk1(data, cluster_count)

    # Stopping benchmark
    elapsed = time.time() - began
    print(str(elapsed) + " seconds for execution")
|
||||
|
||||
# Start the algorithm and generate test data
|
||||
# NOTE(review): `data` is assigned twice in a row, so only the second call
# (10 clusters) reaches startup(); presumably the first line is the pre-change
# side of the diff - confirm.
data = dmtest.numGen(10000, 2, 5, True)
|
||||
data = dmtest.numGen(10000, 10, 5, True)
|
||||
startup(data)
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
#!/usr/bin/env python
|
||||
# title: kmeansMkI.py
|
||||
# description: Our personal Python K-Means++ implementation
|
||||
# author: Tillmann Brendel, Conrad Großer
|
||||
# license: Pending
|
||||
# date: 04.06.2018
|
||||
# version: 1.5
|
||||
# usage: python pyscript.py
|
||||
# title: kmeansMkI.py
|
||||
# description: Our personal Python K-Means++ implementation
|
||||
# author: Tillmann Brendel, Conrad Großer
|
||||
# license: Pending
|
||||
# date: 04.06.2018
|
||||
# version: 1.6
|
||||
# usage: python pyscript.py
|
||||
# notes:
|
||||
# known_issues:
|
||||
# python_version: 3.x
|
||||
# python_version: 3.x
|
||||
# ==============================================================================
|
||||
|
||||
# IMPORTS
|
||||
@@ -31,79 +31,81 @@ import matplotlib.pyplot as plt
|
||||
import dmlib
|
||||
import dmtest
|
||||
|
||||
# CODE
|
||||
|
||||
# Main function of the algorithm
|
||||
def kmeansmk1(xdata, ydata, clusters):
    """Run the 2D k-means loop on the given points and plot the result.

    The centres live in the module globals ``cpoint_0`` ..
    ``cpoint_<clusters - 1>`` as ``[x, y]`` lists, which ``assignCluster``
    and ``calcClusters`` read and update.

    Args:
        xdata: Non-empty sequence of int-convertible x coordinates.
        ydata: Non-empty sequence of int-convertible y coordinates, parallel
            to ``xdata``.
        clusters: Number of cluster centres to fit.
    """
    # Defining cluster points
    for i in range(clusters):
        # randint() includes its upper bound, so the last valid index is
        # len(...) - 1.
        globals()["cpoint_" + str(i)] = [
            xdata[randint(0, len(xdata) - 1)],
            ydata[randint(0, len(ydata) - 1)],
        ]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get the maximum of the data. Compare numerically: max() on digit
    # strings is lexicographic and wrong for values of differing length.
    highpointx = max(int(value) for value in xdata)
    highpointy = max(int(value) for value in ydata)

    # Define variables for running the algorithm
    done = False
    runs = 0

    # As long as calcClusters returns False, reassign the data and move the centres
    while not done:
        runs += 1
        # assigned_points consists of the cluster numbers
        assigned_points = assignCluster(xdata, ydata, clusters, highpointx, highpointy)
        done = calcClusters(xdata, ydata, assigned_points, clusters)

    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    for i in range(clusters):
        centre = globals()["cpoint_" + str(i)]
        # int(): a centre that was never recalculated still holds the raw
        # data entries it was initialised with.
        plt.plot(int(centre[0]), int(centre[1]), 'ro')

    plt.scatter([int(x) for x in xdata], [int(y) for y in ydata], marker='x', s=7, color='k')
    plt.show()
|
||||
|
||||
|
||||
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
||||
def calcClusters(xdata, ydata, assigned_points, clusters):
    """Move every cluster centre to the rounded mean of its assigned points.

    The centres are read from and written to the module globals
    ``cpoint_0`` .. ``cpoint_<clusters - 1>``; the centre found on entry is
    additionally stored in ``oldcpoint_<n>``.

    Args:
        xdata: Sequence of int-convertible x coordinates.
        ydata: Sequence of int-convertible y coordinates, parallel to
            ``xdata``.
        assigned_points: Per point, the number of the cluster it belongs to.
        clusters: Number of cluster centres.

    Returns:
        True when no centre moved in this pass, False when at least one
        centre changed.
    """
    # Initialise once, before the loop: resetting the flag per cluster would
    # let the last cluster alone decide the result.
    cpointunchanged = True

    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        previous = globals()[key]
        # Still published as a module global, in case other code reads it
        globals()["oldcpoint_" + str(cluster)] = previous

        clustersumx = 0
        clustersumy = 0
        count = 0
        for x, y, owner in zip(xdata, ydata, assigned_points):
            if owner == cluster:
                clustersumx += int(x)
                clustersumy += int(y)
                count += 1

        # A cluster without points keeps its centre instead of dividing by zero
        if count:
            globals()[key] = [round(clustersumx / count), round(clustersumy / count)]

        # Checking if old clusterpoint is equal to the one just calculated
        if globals()[key] != previous:
            cpointunchanged = False

    return cpointunchanged
|
||||
|
||||
|
||||
def assignCluster(xdata, ydata, clusters, highpointx, highpointy):
    """Assign every point to the number of its nearest cluster centre.

    The centres are read from the module globals ``cpoint_0`` ..
    ``cpoint_<clusters - 1>``; distances come from ``dmlib.calcdiff2d``.

    Args:
        xdata: Sequence of x coordinates.
        ydata: Sequence of y coordinates, parallel to ``xdata``.
        clusters: Number of cluster centres.
        highpointx: Unused; kept so existing callers keep working.
        highpointy: Unused; kept so existing callers keep working.

    Returns:
        A list with one cluster number per point (0 when ``clusters`` is 0).
    """
    # The centres do not change during assignment, so look them up once
    centres = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    data_assigned = []
    for item, x in enumerate(xdata):
        # Reset per point: without a fresh start a point that beats no bound
        # would silently inherit the previous point's cluster.
        assigned_cluster = 0
        olddistance = None
        for cluster, centre in enumerate(centres):
            distance = dmlib.calcdiff2d(centre, [x, ydata[item]])
            # Strict comparison: on a tie the lower cluster number is kept
            if olddistance is None or distance < olddistance:
                olddistance = distance
                assigned_cluster = cluster
        data_assigned.append(assigned_cluster)

    return data_assigned
|
||||
|
||||
|
||||
# Startup function for collecting necessary xdata
|
||||
def startup(xdata, ydata):
|
||||
# Using two clusters for testing
|
||||
@@ -121,6 +123,7 @@ def startup(xdata, ydata):
|
||||
seconds = time.time() - start_time
|
||||
print(str(seconds) + " seconds for execution")
|
||||
|
||||
|
||||
# Start the algorithm and generate test xdata
|
||||
xdata = dmtest.numGenLight(10000, False, 5)
|
||||
ydata = dmtest.numGenLight(10000, False, 2)
|
||||
|
||||
Reference in New Issue
Block a user