kmeans Update 1.0

- Extracted some functions into separate libraries
- Finished the algorithm, added calcClusters function
- Optimized code
This commit is contained in:
2018-05-30 23:22:02 +02:00
parent 5f9143b604
commit 77f3a16c45
5 changed files with 96 additions and 78 deletions

Binary file not shown.

Binary file not shown.

22
src/algorithms/dmlib.py Normal file
View File

@@ -0,0 +1,22 @@
def calcdiff(point1, point2, data=None):
    """Return the absolute difference between two values.

    Both values are converted with ``int()`` before subtracting, so
    numeric strings (for example PLZ codes such as ``"05123"``) are
    accepted as well as ints.

    Args:
        point1: First value (int or int-convertible string).
        point2: Second value (int or int-convertible string).
        data: Unused. Kept (now optional) so that existing
            three-argument calls keep working.

    Returns:
        The non-negative integer ``|int(point1) - int(point2)|``.
    """
    return abs(int(point1) - int(point2))
def betrag(number):
    """Return the absolute value of ``number`` ("Betrag" = absolute value).

    Args:
        number: An int or float.

    Returns:
        ``number`` unchanged when it is not negative; otherwise the
        negated value converted with ``int()``.
    """
    if number < 0:
        # Plain negation is exact for arbitrarily large ints; the former
        # (-2 * number) / 2 went through float division and lost precision.
        return int(-number)
    return number
def findHighest(data):
    """Return the highest value in ``data`` as an int.

    Every entry is converted with ``int()`` before comparing, so numeric
    strings are accepted.

    Args:
        data: Iterable of ints or int-convertible strings.

    Returns:
        The largest converted value, or 0 when ``data`` is empty.
    """
    # max() over the real values instead of seeding the search with 0,
    # which wrongly reported 0 for input containing only negative numbers.
    return max((int(entry) for entry in data), default=0)

28
src/algorithms/dmtest.py Normal file
View File

@@ -0,0 +1,28 @@
# For random generation of numbers import randint
from random import randint, shuffle
def testgenerator():
    """Build a shuffled list of 100 test PLZ strings.

    Positions 0-40 (41 entries) get the prefix "05", positions 41-79
    (39 entries) get the prefix "50", and the remaining 20 entries are
    generated without a prefix. The list is shuffled before it is returned.

    Returns:
        A 1D list of 100 PLZ strings in random order.
    """
    def prefix_for(position):
        # Same thresholds as the original if/elif/else chain.
        if position <= 40:
            return "05"
        if position < 80:
            return "50"
        return ""

    plz_values = [generatePLZ(prefix_for(position)) for position in range(100)]
    shuffle(plz_values)
    return plz_values
def generatePLZ(start, length=5):
    """Generate a PLZ string that begins with ``start``.

    Random decimal digits are appended until the result has ``length``
    characters, so prefixes of any size produce a full-length PLZ (the
    previous version always appended exactly three digits to a non-empty
    prefix).

    Args:
        start: Prefix of the PLZ; may be the empty string.
        length: Total number of characters of the result (default 5).

    Returns:
        ``start`` followed by random digits. A ``start`` that already has
        ``length`` or more characters is returned unchanged.
    """
    missing = max(length - len(start), 0)
    return start + "".join(str(randint(0, 9)) for _ in range(missing))

View File

@@ -3,7 +3,7 @@
#description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer
#date: 26.05.2018
#version: 0.2
#version: 1.0
#usage: python pyscript.py
#notes:
#known_issues:
@@ -16,124 +16,92 @@
import time
from datetime import date
# For random generation of numbers import randint and shuffle to shuffle an array
from random import randint, shuffle
# For random generation of numbers import randint
from random import randint
# Importing library for multi-core processing
import multiprocessing
# Importing own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(data):
# Using two clusters for testing
clusters = 2
def kmeansmk1(data, clusters, runs):
    """Run the clustering loop on ``data``.

    For each of the ``clusters`` clusters a randomly chosen entry of
    ``data`` is stored as the initial centre in the module-level global
    ``cpoint_<i>`` and printed. Afterwards ``assignCluster`` and
    ``calcClusters`` are called ``runs`` times in a row.

    Args:
        data: Non-empty sequence of data points.
        clusters: Number of clusters (and of ``cpoint_<i>`` globals).
        runs: Number of assign/recalculate iterations.

    Returns:
        Always 0; the centres are left in the ``cpoint_<i>`` globals.
    """
    # Defining cluster points
    last_index = len(data) - 1
    for i in range(clusters):
        # randint() includes its upper bound, so it must be the last valid
        # index; the former randint(0, len(data)) could raise IndexError.
        globals()["cpoint_" + str(i)] = data[randint(0, last_index)]
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
    # Get max value in the data array
    highPoint = dmlib.findHighest(data)
    for _ in range(runs):
        new_data = assignCluster(data, highPoint, clusters)
        calcClusters(new_data, clusters)
    return 0
def calcClusters(data, clusters):
    """Recalculate every cluster centre as the rounded mean of its members.

    The centres are read from and written back to the module-level
    globals ``cpoint_0`` .. ``cpoint_<clusters - 1>``.

    Args:
        data: Two parallel lists: ``data[0]`` holds the items (ints or
            int-convertible strings) and ``data[1]`` holds, per item, the
            centre value it was assigned to.
        clusters: Number of clusters to update.

    Returns:
        Always 0; the result is stored in the ``cpoint_<i>`` globals.
    """
    items, assigned = data[0], data[1]
    centres = globals()
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        centre = centres[key]
        members = [int(item) for item, owner in zip(items, assigned) if owner == centre]
        # A cluster without members keeps its current centre; dividing by
        # a member count of zero used to raise ZeroDivisionError here.
        if members:
            centres[key] = round(sum(members) / len(members))
    return 0
def assignCluster(data, highPoint, clusters):
# Assigns every entry of `data` to one of the module-level cluster points
# cpoint_0 .. cpoint_<clusters - 1> and returns new_data = [data, data_assigned].
# NOTE(review): this span looks like a diff rendering in which removed and
# added lines are interleaved (e.g. two consecutive `for item ...` headers and
# both calcdiff(...) and dmlib.calcdiff(...) variants) - confirm which lines
# belong to the current version before relying on it.
# Create a new data array for working
new_data = []
new_data.append(data)
# Get the size of the data array
data_size = len(new_data[0])
# Defining cluster points
for i in range(0, clusters):
globals()["cpoint_" + str(i)] = randint(0, data_size)
print("Cluster " + str(i) + ": " + str(new_data[0][globals()["cpoint_" + str(i)]]))
# Create new array for assigned clusters of each value
data_assigned = []
# Get max value in the data array
highPoint = findHighest(new_data[0])
# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
for item in range(0, data_size):
for item in range(0, len(new_data[0])):
# Set the minimal cluster difference to the highest value in the list to ease comparison
min_cluster = highPoint
# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
# NOTE(review): the comparison is a strict `>`, so when no difference is
# smaller than highPoint, assinged_cluster is not set in this iteration.
for cluster in range(0, clusters):
clusternumber = globals()["cpoint_" + str(cluster)]
if min_cluster > calcdiff(item, clusternumber, new_data[0]):
min_cluster = calcdiff(item, clusternumber, new_data[0])
assinged_cluster = clusternumber
if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0]):
min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)], new_data[0])
assinged_cluster = globals()["cpoint_" + str(cluster)]
# Assign the minimal difference cluster to the data
data_assigned.append(assinged_cluster)
# Add the assigned values list to the new_data array
new_data.append(data_assigned)
# Print out the list of datapoints and assigned clusters
for item in range(0, len(new_data[0])):
print("Datapoint: " + str(new_data[0][item]) + " | Assigned cluster: " + str(new_data[0][new_data[1][item]]))
# Result layout: new_data[0] is the input data, new_data[1] the per-item assignment
return new_data
def findHighest(data):
    """Return the highest value in ``data`` as an int.

    Every entry is converted with ``int()`` before comparing, so numeric
    strings are accepted.

    Args:
        data: Iterable of ints or int-convertible strings.

    Returns:
        The largest converted value, or 0 when ``data`` is empty.
    """
    # Take the maximum of the real values; starting the search at 0
    # reported 0 for input that contains only negative numbers.
    return max((int(entry) for entry in data), default=0)
def calcdiff(point1, point2, data):
    """Return the absolute difference between two entries of ``data``.

    Args:
        point1: Index of the first entry in ``data``.
        point2: Index of the second entry in ``data``.
        data: Sequence of ints or int-convertible strings.

    Returns:
        The non-negative integer ``|int(data[point1]) - int(data[point2])|``.
    """
    # abs() replaces the manual larger-minus-smaller branching and the
    # extra absolute-value helper call on an already non-negative number.
    return abs(int(data[point1]) - int(data[point2]))
def betrag(number):
    """Return the absolute value of ``number`` ("Betrag" = absolute value).

    Args:
        number: An int or float.

    Returns:
        ``number`` unchanged when it is not negative; otherwise the
        negated value converted with ``int()``.
    """
    if number < 0:
        # Negating directly keeps large ints exact; (-2 * number) / 2 used
        # float division and silently lost precision.
        return int(-number)
    return number
# Startup function: sets the run parameters, starts the algorithm and takes the time
def startup(data):
# `data` is passed straight through to kmeansmk1.
# NOTE(review): the two kmeansmk1 calls below have different arity; this looks
# like a diff rendering with the removed and the added line interleaved -
# confirm which one belongs to the current version.
# Using two clusters for testing
# clusters = int(input("How many clusters are known? "))
clusters = 2
# cores = input("How many cores should be used? ")
# path = input("Where is the data? ") or in this case data
# runs = int(input("How many runs are sufficient? "))
runs = 500
# For benchmarking starting the timer now
start_time = time.time()
# Firing up the engines!
kmeansmk1(data)
# kmeansmk1(clusters, cores, data)
kmeansmk1(data, clusters, runs)
# Stopping benchmark
# Elapsed wall-clock seconds; only used by the commented-out print below
seconds = time.time() - start_time
# print(str(seconds) + " seconds for execution")
def testgenerator():
    """Build a shuffled list of 100 test PLZ strings.

    Positions 0-20 get the prefix "09", positions 21-49 the prefix "08",
    and the remaining entries are generated without a prefix. The list is
    shuffled before it is returned.

    Returns:
        A list of 100 PLZ strings in random order.
    """
    plz_list = []
    for index in range(100):
        if index <= 20:
            prefix = "09"
        elif index < 50:
            prefix = "08"
        else:
            prefix = ""
        plz_list.append(generatePLZ(prefix))
    shuffle(plz_list)
    return plz_list
def generatePLZ(start, length=5):
    """Generate a PLZ string that begins with ``start``.

    Random decimal digits are appended until the result has ``length``
    characters, so prefixes of any size produce a full-length PLZ (the
    previous version always appended exactly three digits to a non-empty
    prefix).

    Args:
        start: Prefix of the PLZ; may be the empty string.
        length: Total number of characters of the result (default 5).

    Returns:
        ``start`` followed by random digits. A ``start`` that already has
        ``length`` or more characters is returned unchanged.
    """
    digits_needed = max(length - len(start), 0)
    suffix = "".join(str(randint(0, 9)) for _ in range(digits_needed))
    return start + suffix
# Generate the test data and start the algorithm
# NOTE(review): `data` is assigned twice in a row; this looks like a removed
# and an added diff line shown together - confirm which one is current.
data = testgenerator()
data = dmtest.testgenerator()
startup(data)