kmeans Update 1.0
- Exported some functions into separate libraries - Finished the algorithm, added calcClusters function - Optimized code
This commit is contained in:
BIN
src/algorithms/__pycache__/dmlib.cpython-36.pyc
Normal file
BIN
src/algorithms/__pycache__/dmlib.cpython-36.pyc
Normal file
Binary file not shown.
BIN
src/algorithms/__pycache__/dmtest.cpython-36.pyc
Normal file
BIN
src/algorithms/__pycache__/dmtest.cpython-36.pyc
Normal file
Binary file not shown.
22
src/algorithms/dmlib.py
Normal file
22
src/algorithms/dmlib.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
def calcdiff(point1, point2, data):
    """Return the absolute difference between two values.

    Both values are converted with ``int()`` before subtracting, so numeric
    strings such as ``"05123"`` are accepted as well as ints.

    Args:
        point1: First value (int or int-convertible string).
        point2: Second value (int or int-convertible string).
        data: Unused. Kept so existing three-argument calls keep working.

    Returns:
        The non-negative integer distance between the two values.

    Raises:
        ValueError: If either value cannot be converted by ``int()``.
    """
    return abs(int(point1) - int(point2))
|
||||||
|
|
||||||
|
def betrag(number):
    """Return the absolute value of ``number`` ("Betrag" is German for absolute value).

    The built-in ``abs`` is used instead of multiplying and dividing, so
    integers of any size stay exact and the result has the same numeric
    type as the input.

    Args:
        number: An int or float.

    Returns:
        ``number`` without its sign.
    """
    return abs(number)
|
||||||
|
|
||||||
|
def findHighest(data):
    """Return the largest entry of ``data`` after converting each one with ``int()``.

    Args:
        data: Iterable of ints or int-convertible strings.

    Returns:
        The maximum as an int, or 0 when ``data`` is empty.

    Raises:
        ValueError: If an entry cannot be converted by ``int()``.
    """
    # default=0 keeps the previous result for empty input, while non-empty
    # input now yields its true maximum even when every value is negative.
    return max((int(value) for value in data), default=0)
|
||||||
28
src/algorithms/dmtest.py
Normal file
28
src/algorithms/dmtest.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Import randint for random number generation and shuffle to randomize the order of the test data
|
||||||
|
from random import randint, shuffle
|
||||||
|
|
||||||
|
def testgenerator():
    """Generate 100 shuffled test PLZ strings (presumably German postal codes).

    Composition by loop index:
      * indexes 0-40  (41 entries) start with "05",
      * indexes 41-79 (39 entries) start with "50",
      * indexes 80-99 (20 entries) are fully random.

    Returns:
        A flat list of 100 PLZ strings in random order.
    """
    dataArray = []
    for i in range(100):
        if i <= 40:
            prefix = "05"
        elif i < 80:
            prefix = "50"
        else:
            prefix = ""
        dataArray.append(generatePLZ(prefix))
    shuffle(dataArray)
    return dataArray
|
||||||
|
|
||||||
|
def generatePLZ(start, length=5):
    """Build a PLZ string that begins with ``start`` and is filled up with random digits.

    Args:
        start: Leading characters of the PLZ; an empty string gives a fully
            random result.
        length: Total number of characters wanted (default 5).

    Returns:
        ``start`` followed by as many random decimal digits as are needed to
        reach ``length`` characters. If ``start`` is already at least
        ``length`` characters long it is returned unchanged.
    """
    # Pad by the number of missing characters instead of a fixed count, so
    # prefixes of any length still produce a PLZ of the requested size.
    missing = max(length - len(start), 0)
    return start + "".join(str(randint(0, 9)) for _ in range(missing))
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
#description: Our personal Python K-Means++ implementation
|
#description: Our personal Python K-Means++ implementation
|
||||||
#author: Tillmann Brendel, Conrad Großer
|
#author: Tillmann Brendel, Conrad Großer
|
||||||
#date: 26.05.2018
|
#date: 26.05.2018
|
||||||
#version: 0.2
|
#version: 1.0
|
||||||
#usage: python pyscript.py
|
#usage: python pyscript.py
|
||||||
#notes:
|
#notes:
|
||||||
#known_issues:
|
#known_issues:
|
||||||
@@ -16,124 +16,92 @@
|
|||||||
import time
|
import time
|
||||||
from datetime import date
|
from datetime import date
|
||||||
|
|
||||||
# For random generation of numbers import randint and shuffle to shuffle an array
|
# For random generation of numbers import randint
|
||||||
from random import randint, shuffle
|
from random import randint
|
||||||
|
|
||||||
# Importing library for multi-core processing
|
# Importing library for multi-core processing
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
||||||
|
# Importing our own libraries: Datamining Library (dmlib) and Datamining Test (dmtest)
|
||||||
|
import dmlib
|
||||||
|
import dmtest
|
||||||
|
|
||||||
# CODE
|
# CODE
|
||||||
def kmeansmk1(data, clusters, runs):
    """Run the clustering loop on ``data`` for a fixed number of rounds.

    The cluster centres are kept in module globals named ``cpoint_0`` ...
    ``cpoint_<clusters-1>``; ``assignCluster`` and ``calcClusters`` read and
    update them there, so the final centres are found in those globals.

    Args:
        data: Sequence of int-convertible values to cluster.
        clusters: Number of cluster centres to use.
        runs: Number of assign/recompute rounds to perform.

    Returns:
        Always 0.

    Raises:
        ValueError: If ``data`` is empty while ``clusters`` is positive
            (``randint`` is given an empty range).
    """
    state = globals()

    # Seed every centre with a randomly chosen data entry. randint() includes
    # its upper bound, so the last valid index is len(data) - 1.
    for i in range(clusters):
        state["cpoint_" + str(i)] = data[randint(0, len(data) - 1)]
        print("Initial cluster " + str(i + 1) + ": " + str(state["cpoint_" + str(i)]))

    # Largest value in the data; handed to assignCluster on every round.
    highPoint = dmlib.findHighest(data)

    for _ in range(runs):
        new_data = assignCluster(data, highPoint, clusters)
        calcClusters(new_data, clusters)

    return 0
|
||||||
|
|
||||||
|
def calcClusters(data, clusters):
    """Recompute every cluster centre as the rounded mean of its assigned items.

    Args:
        data: Two parallel lists. ``data[0]`` holds the items and ``data[1]``
            holds, for each item, the centre value it was assigned to.
        clusters: Number of clusters. Centres are read from and written back
            to the module globals ``cpoint_0`` ... ``cpoint_<clusters-1>``.

    Returns:
        Always 0; the result is the updated ``cpoint_*`` globals.
    """
    state = globals()
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        centre = state[key]
        members = [
            int(item)
            for item, assigned in zip(data[0], data[1])
            if assigned == centre
        ]
        # A cluster without members keeps its previous centre; averaging
        # zero items would otherwise raise ZeroDivisionError.
        if members:
            state[key] = round(sum(members) / len(members))
    return 0
|
||||||
|
|
||||||
|
def assignCluster(data, highPoint, clusters):
    """Assign every item in ``data`` to its nearest cluster centre.

    Centres are read from the module globals ``cpoint_0`` ...
    ``cpoint_<clusters-1>``. Distances come from ``dmlib.calcdiff``. When
    several centres are equally close, the one with the lowest index wins.

    Args:
        data: Sequence of int-convertible values.
        highPoint: Unused. It was the starting upper bound of the distance
            search and is kept so existing calls keep working.
        clusters: Number of cluster centres to consider.

    Returns:
        A two-element list ``[data, assigned]`` where ``assigned[i]`` is the
        centre value chosen for ``data[i]``.

    Raises:
        ValueError: If ``clusters`` is 0 while ``data`` is non-empty
            (``min`` is given an empty sequence).
    """
    state = globals()
    centres = [state["cpoint_" + str(cluster)] for cluster in range(clusters)]

    # Picking the minimum directly guarantees every item gets a centre, even
    # when its distance equals or exceeds highPoint. The earlier strict
    # comparison against highPoint could leave the assignment unset or stale.
    data_assigned = []
    for value in data:
        nearest = min(
            centres,
            key=lambda centre: dmlib.calcdiff(value, centre, data),
        )
        data_assigned.append(nearest)

    return [data, data_assigned]
|
||||||
|
|
||||||
# Determine the highest int value in an array
|
|
||||||
def findHighest(data):
|
|
||||||
maximum = 0
|
|
||||||
for i in range(0, len(data)):
|
|
||||||
if int(data[i]) > maximum:
|
|
||||||
maximum = int(data[i])
|
|
||||||
return maximum
|
|
||||||
|
|
||||||
# Calculate the difference between two points giving the indexes of these data entries
|
|
||||||
def calcdiff(point1, point2, data):
|
|
||||||
if int(data[point2]) > int(data[point1]):
|
|
||||||
difference = int(data[point2]) - int(data[point1])
|
|
||||||
else:
|
|
||||||
difference = int(data[point1]) - int(data[point2])
|
|
||||||
# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
|
|
||||||
return betrag(difference)
|
|
||||||
|
|
||||||
# Get the absolute value of a number
|
|
||||||
def betrag(number):
|
|
||||||
if number < 0:
|
|
||||||
number = int((-2 * number) / 2)
|
|
||||||
return number
|
|
||||||
|
|
||||||
def startup(data, clusters=2, runs=500):
    """Run ``kmeansmk1`` on ``data`` and measure how long the call takes.

    Args:
        data: Data set handed unchanged to ``kmeansmk1``.
        clusters: Number of clusters to use (default 2, the value that was
            previously fixed for testing).
        runs: Number of rounds to perform (default 500, the value that was
            previously fixed).

    Returns:
        The wall-clock duration of the ``kmeansmk1`` call in seconds.
    """
    # For benchmarking, start the timer right before the algorithm runs
    start_time = time.time()

    kmeansmk1(data, clusters, runs)

    # Stop the benchmark and hand the measurement to the caller
    seconds = time.time() - start_time
    return seconds
|
||||||
|
|
||||||
# Simple generator for test data
|
|
||||||
def testgenerator():
|
|
||||||
dataArray = []
|
|
||||||
for i in range(0,100):
|
|
||||||
if i <= 20:
|
|
||||||
plz = generatePLZ("09")
|
|
||||||
elif i > 20 and i < 50:
|
|
||||||
plz = generatePLZ("08")
|
|
||||||
else:
|
|
||||||
plz = generatePLZ("")
|
|
||||||
dataArray.append(plz)
|
|
||||||
shuffle(dataArray)
|
|
||||||
return dataArray
|
|
||||||
|
|
||||||
# Generates a PLZ from a certain start point
|
|
||||||
def generatePLZ(start):
|
|
||||||
if len(start) == 0:
|
|
||||||
plz = ""
|
|
||||||
for j in range(1,6):
|
|
||||||
plz = plz + str(randint(0,9))
|
|
||||||
else:
|
|
||||||
plz = start
|
|
||||||
for j in range(1,4):
|
|
||||||
plz = plz + str(randint(0,9))
|
|
||||||
return plz
|
|
||||||
|
|
||||||
# Generate test data and start the algorithm, but only when this file is run
# as a script, so that importing the module does not trigger a full run.
if __name__ == "__main__":
    data = dmtest.testgenerator()
    startup(data)
|
||||||
Reference in New Issue
Block a user