143 lines
4.6 KiB
Python
143 lines
4.6 KiB
Python
#!/usr/bin/env python
|
|
#title: kmeansMkI.py
|
|
#description: Our personal Python K-Means++ implementation
|
|
#author: Tillmann Brendel, Conrad Großer
|
|
#license: Pending
|
|
#date: 26.05.2018
|
|
#version: 1.2
|
|
#usage: python pyscript.py
|
|
#notes:
|
|
#dependencies: matplotlib
|
|
#known_issues: When clusters are 'thin' or noise is too strong --> inaccurate
|
|
#python_version: 3.x
|
|
#==============================================================================
|
|
|
|
# IMPORTS
|
|
|
|
# Importing the time for benchmarking purposes
|
|
import time
|
|
from datetime import date
|
|
|
|
# For random generation of numbers import randint
|
|
from random import randint
|
|
|
|
# Importing libraries for easy plotting
|
|
import matplotlib.pyplot as plt
|
|
|
|
# Importing own libraries: Datamining Library and Datamining Test
|
|
import dmlib
|
|
import dmtest
|
|
|
|
# CODE
|
|
# Main function of the algorithm
|
|
def kmeansmk1(data, clusters):
    """Fit ``clusters`` centers to one-dimensional data and plot the result.

    The centers are kept in module globals named ``cpoint_0`` ...
    ``cpoint_<clusters - 1>`` because ``assignCluster`` and ``calcClusters``
    read and update them there.

    Args:
        data: Indexable, sized sequence of values. Every value must be
            accepted by ``int()``, which is used when plotting.
        clusters: Number of cluster centers to fit.

    Returns:
        Always 0.

    Raises:
        ValueError: If ``data`` is empty or ``clusters`` is smaller than 1.
    """
    if len(data) == 0:
        raise ValueError("data must not be empty")
    if clusters < 1:
        # Without any center the refinement loop below could never finish.
        raise ValueError("clusters must be at least 1")

    centers = globals()

    # Seed every center with a randomly chosen data point.
    for i in range(clusters):
        # randint() includes BOTH bounds, so the upper bound has to be the
        # last valid index, not len(data).
        centers["cpoint_" + str(i)] = data[randint(0, len(data) - 1)]
        print("Initial cluster " + str(i + 1) + ": " + str(centers["cpoint_" + str(i)]))

    # Upper bound handed to assignCluster (the highest value in the data,
    # according to the original comment on dmlib.findHighest).
    highPoint = dmlib.findHighest(data)

    # Alternate between assigning items to centers and moving the centers
    # until calcClusters reports that it is done. ``runs`` only counts the
    # iterations for the report below.
    done = 0
    runs = 0
    while done == 0:
        runs = runs + 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Report the final centers.
    for i in range(clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(centers["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # The data is one-dimensional; use the item index as an artificial
    # y-coordinate so it can be drawn in a 2D scatter plot.
    anew = list(range(len(data)))

    # Draw every center as a vertical red line.
    for i in range(clusters):
        plt.axvline(x=int(centers["cpoint_" + str(i)]), color='r')

    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0
|
|
|
|
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
|
def calcClusters(data, clusters):
    """Move every cluster center to the rounded mean of its assigned items.

    Centers are read from and written back to the module globals
    ``cpoint_0`` ... ``cpoint_<clusters - 1>``.

    Args:
        data: Two parallel sequences. ``data[0]`` holds the items (each must
            be accepted by ``int()``); ``data[1]`` holds, for every item, the
            value of the center it was assigned to.
        clusters: Number of cluster centers.

    Returns:
        1 when no center moved during this call (converged), otherwise 0.
    """
    centers = globals()
    converged = 1

    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        prev_cluster = centers[key]

        # Items are matched to a center by the center's VALUE, which is what
        # assignCluster stores in data[1].
        members = [
            int(item)
            for item, assigned in zip(data[0], data[1])
            if assigned == prev_cluster
        ]

        # A center that attracted no items keeps its position; dividing by
        # the empty count would raise ZeroDivisionError.
        if members:
            centers[key] = round(sum(members) / len(members))

        # One moved center is enough to require another iteration; being
        # done means that ALL centers stayed where they were.
        if centers[key] != prev_cluster:
            converged = 0

    return converged
|
|
|
|
def assignCluster(data, highPoint, clusters):
    """Assign every item to its nearest cluster center.

    Centers are read from the module globals ``cpoint_0`` ...
    ``cpoint_<clusters - 1>``. Distances come from ``dmlib.calcdiff(item,
    center)``; its results only need to be comparable with ``<``
    (presumably an absolute difference -- confirm against dmlib).

    Args:
        data: Sequence of items to assign.
        highPoint: Kept for backward compatibility and no longer used. It
            used to be the starting bound of the minimum search, which left
            the assignment unset or stale whenever no distance was strictly
            below it.
        clusters: Number of cluster centers.

    Returns:
        A two-element list ``[data, assigned]`` where ``assigned[i]`` is the
        value of the center nearest to ``data[i]``. On equal distances the
        center with the lowest index wins.
    """
    # Look the centers up once instead of on every comparison.
    centers = [globals()["cpoint_" + str(index)] for index in range(clusters)]

    data_assigned = []
    for value in data:
        best_center = None
        best_diff = None
        for center in centers:
            # Evaluate the distance once per item/center pair.
            diff = dmlib.calcdiff(value, center)
            # Strict '<' keeps the first center when distances are equal.
            if best_diff is None or diff < best_diff:
                best_diff = diff
                best_center = center
        data_assigned.append(best_center)

    return [data, data_assigned]
|
|
|
|
# Startup function for collecting necessary data
|
|
def startup(data):
    """Ask for the cluster count, run the clustering and print the run time.

    Args:
        data: Data set that is handed unchanged to ``kmeansmk1``.
    """
    # Only the number of clusters is prompted for; core count and data path
    # are not asked for (yet).
    clusters = int(input("How many clusters are known? "))

    # The clock starts after the prompt, so typing time is not measured.
    began = time.time()
    kmeansmk1(data, clusters)
    elapsed = time.time() - began

    print(str(elapsed) + " seconds for execution")
|
|
|
|
# Script entry: generate the test data and start the interactive run.
# Alternative generators that were tried before:
#   data = dmtest.plzGen(10000)
#   data = dmtest.numGen(10000, 3, 5)
data = dmtest.numGen(10000, 8, 7)
startup(data)
|