data_mining_algorithms/src/algorithms/kmeansMkI.py

#!/usr/bin/env python
#title:				kmeansMkI.py
#description:		Our personal Python K-Means++ implementation
#author:			Tillmann Brendel, Conrad Großer
#license:			Pending
#date:				26.05.2018
#version:			1.2
#usage:				python pyscript.py
#notes:
#dependencies:		matplotlib
#known_issues:		When clusters are 'thin' or noise is too strong --> inaccurate
#python_version:	3.x
#==============================================================================

# IMPORTS

# Importing the time for benchmarking purposes
import time
from datetime import date

# For random generation of numbers import randint
from random import randint

# Importing libraries for easy plotting
import matplotlib.pyplot as plt

# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
import dmlib
import dmtest

# CODE
# Main function of the algorithm
def kmeansmk1(data, clusters):
	"""Cluster 1D ``data`` into ``clusters`` groups, print the centres and plot them.

	The cluster centres live in module globals named ``cpoint_<i>`` because
	``assignCluster`` and ``calcClusters`` read and update them there.

	Args:
		data: Indexable sequence of values; every value must be accepted by
			``int()`` because the values are converted for plotting.
		clusters: Number of cluster centres to fit.

	Returns:
		Always 0.
	"""
	# Seed every centre with a randomly chosen data point.
	# randint() includes BOTH bounds, so the upper bound must be the last valid index.
	last_index = len(data) - 1
	for i in range(clusters):
		globals()["cpoint_" + str(i)] = data[randint(0, last_index)]
		print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

	# Highest value in the data, handed on to assignCluster
	highPoint = dmlib.findHighest(data)

	# runs only counts the iterations for the benchmark output
	done = 0
	runs = 0

	# Alternate between assigning the data to the centres and recomputing the
	# centres until calcClusters returns a non-zero value
	while done == 0:
		runs = runs + 1
		new_data = assignCluster(data, highPoint, clusters)
		done = calcClusters(new_data, clusters)

	# Printing final clusters
	for i in range(clusters):
		print("Endcluster " + str(i + 1) + " is calculated to be at  " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

	# Artificial y values (0 .. len(data) - 1) so the 1D data can be drawn in a 2D scatter plot
	anew = list(range(len(data)))

	# Drawing found clusters as vertical lines
	for i in range(clusters):
		plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

	# Showing graph
	plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
	plt.show()

	return 0

# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
	"""Move every cluster centre to the rounded mean of the items assigned to it.

	The centres are read from and written back to the module globals
	``cpoint_<i>``.

	Args:
		data: Two parallel sequences. ``data[0]`` holds the items (each must be
			accepted by ``int()``), ``data[1]`` holds for every item the value
			of the centre it was assigned to.
		clusters: Number of cluster centres.

	Returns:
		1 when no centre moved during this pass (the clustering is stable),
		otherwise 0.
	"""
	# Assume convergence until a centre is seen to move; a single unchanged
	# centre must not end the iteration while other centres are still moving
	converged = 1
	for cluster in range(clusters):
		key = "cpoint_" + str(cluster)
		# Centre before the update; items are matched to their cluster by this value
		prev_cluster = globals()[key]

		# Values of all items that were assigned to this centre
		members = [int(item) for item, assigned in zip(data[0], data[1]) if assigned == prev_cluster]

		# A cluster without any items keeps its centre (avoids a division by zero)
		if members:
			globals()[key] = round(sum(members) / len(members))

		# Any centre that moved means another pass is required
		if prev_cluster != globals()[key]:
			converged = 0

	return converged

def assignCluster(data, highPoint, clusters):
	"""Assign every item in ``data`` to the cluster centre with the smallest difference.

	The centres are read from the module globals ``cpoint_<i>``; differences
	are computed with ``dmlib.calcdiff``.

	Args:
		data: Sequence of items to assign.
		highPoint: Unused; retained so existing callers keep working. It used to
			seed the minimum search, which left an item without a cluster
			whenever no difference was strictly smaller than it.
		clusters: Number of cluster centres.

	Returns:
		A two-element list ``[data, assigned]`` where ``assigned[i]`` is the
		value of the centre chosen for ``data[i]`` (``None`` if ``clusters``
		is 0). ``data`` is the same object that was passed in, not a copy.
	"""
	# Centre value chosen for each item, in the same order as data
	data_assigned = []

	for item in data:
		# Start without a candidate so the first centre is always accepted;
		# a value carried over from the previous item can never leak in
		min_diff = None
		assigned_cluster = None

		for cluster in range(clusters):
			cpoint = globals()["cpoint_" + str(cluster)]
			# Compute the difference once per centre
			diff = dmlib.calcdiff(item, cpoint)
			# Strict comparison: on a tie the centre with the lower index wins
			if min_diff is None or diff < min_diff:
				min_diff = diff
				assigned_cluster = cpoint

		data_assigned.append(assigned_cluster)

	# First the items, then their assigned centres
	return [data, data_assigned]

# Startup function for collecting necessary data
def startup(data):
	"""Ask for the number of clusters, run ``kmeansmk1`` on ``data`` and print the elapsed time.

	Args:
		data: The data set that is handed on to ``kmeansmk1`` unchanged.
	"""
	# The cluster count is read from the console before the timer starts,
	# so the time spent answering the prompt is not part of the benchmark.
	# Prompts for a core count and a data path are not implemented; the data
	# is supplied by the caller.
	answer = input("How many clusters are known? ")
	cluster_count = int(answer)

	# Benchmark: wall-clock time around the clustering run
	timer_start = time.time()
	kmeansmk1(data, cluster_count)
	elapsed = time.time() - timer_start

	print(str(elapsed) + " seconds for execution")

# Script entry point: the statements below run whenever this file is executed or imported.
# Alternative test data generators that are currently disabled:
# data = dmtest.plzGen(10000)
# data = dmtest.numGen(10000, 3, 5)

# Generate the test data with dmtest.numGen and hand it to startup(), which prompts for the cluster count.
# NOTE(review): the meaning of numGen's three arguments is defined in dmtest, confirm there.
data = dmtest.numGen(10000, 8, 7)
startup(data)