Compare commits

3 Commits

Author SHA1 Message Date
94ed193954 Update .gitignore 2019-04-25 12:45:05 +02:00
e2ad63f90f Code Rework 2019-04-25 12:43:37 +02:00
25fa068df9 kmeans ++ Update
- Added concept for kmeans ++
2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
testdata/
__pycache__/
.DS_Store

View File

@@ -1,40 +1,27 @@
# Calculate the difference between two points giving the indexes of these data entries
def calcdiff(point1, point2):
    """Return the absolute difference between two values.

    Both arguments are passed through ``int()`` before subtracting, so
    ints and numeric strings are accepted interchangeably.

    Args:
        point1: First value (int or int-convertible string).
        point2: Second value (int or int-convertible string).

    Returns:
        The non-negative integer distance between the two values.
    """
    return abs(int(point1) - int(point2))
# Get the absolute value of a number and returns it as int
def betrag(number):
    """Return the absolute value of ``number``.

    Negative inputs are negated and passed through ``int()``;
    non-negative inputs are returned unchanged.  Negating directly
    (instead of multiplying and dividing by two) avoids a round trip
    through float, which loses precision for very large integers.

    Args:
        number: The value whose magnitude is wanted.

    Returns:
        The magnitude of ``number``.
    """
    if number < 0:
        return int(-number)
    return number
# Determines the highest int value in an array and returns it as an int
def findHighest(data):
    """Return the largest value in ``data`` after converting each entry to int.

    Args:
        data: Iterable of ints or int-convertible strings.

    Returns:
        The numerically largest entry as an int, or 0 when ``data`` is
        empty.  Data consisting only of negative values yields the true
        maximum rather than 0.
    """
    return max((int(value) for value in data), default=0)
def pp_calcdiff(data, clusterpoint):
    """Return the entry of ``data`` that lies farthest from ``clusterpoint``.

    Distances are measured with ``calcdiff``.  On ties the first entry
    with the largest distance wins.

    Args:
        data: Sequence of values accepted by ``calcdiff``.
        clusterpoint: The reference cluster point.

    Returns:
        The farthest entry, or 0 when ``data`` is empty or no entry has a
        distance greater than zero.
    """
    max_diff = 0
    new_cluster = 0
    for candidate in data:
        # Compute the distance once per entry instead of once for the
        # comparison and again for the assignment
        diff = calcdiff(candidate, clusterpoint)
        if diff > max_diff:
            max_diff = diff
            new_cluster = candidate
    return new_cluster
def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
    """Return the entry of ``data`` with the largest summed distance to two points.

    The score of an entry is ``calcdiff(entry, clusterpoint) +
    calcdiff(entry, clusterpoint_2)``.  On ties the first entry with the
    largest score wins.

    Args:
        data: Sequence of values accepted by ``calcdiff``.
        clusterpoint: First reference cluster point.
        clusterpoint_2: Second reference cluster point.

    Returns:
        The entry with the largest combined distance, or 0 when ``data``
        is empty or no entry scores above zero.
    """
    max_diff = 0
    new_cluster = 0
    for candidate in data:
        combined = calcdiff(candidate, clusterpoint) + calcdiff(candidate, clusterpoint_2)
        if combined > max_diff:
            # Remember the same quantity that was compared (the sum);
            # storing only the first distance lets a later entry with a
            # smaller sum replace a better one
            max_diff = combined
            new_cluster = candidate
    return new_cluster

View File

@@ -1,34 +1,38 @@
# For random generation of numbers import randint
from random import randint, shuffle
# Simple generator for test plzs (biased: 40% start with 2, 20% with 9, 30% with 4, 10% with a random digit), returns 1D array of plzs
def plzGen(entries):
    """Generate a shuffled list of biased 5-digit test postal codes.

    The leading digit is distributed by position before shuffling: the
    first 40% start with 2, the next 20% with 9, the next 30% with 4 and
    the remainder with a random digit.

    Args:
        entries: Number of codes to generate (int or int-convertible).

    Returns:
        A shuffled list of ``int(entries)`` code strings.
    """
    # Convert once so the loop bound and the thresholds use the same number
    total = int(entries)
    plz_length = 5

    # Thresholds do not change while generating, so compute them up front
    first_cut = round(total * 0.4)
    second_cut = round(total * 0.6)
    third_cut = round(total * 0.9)

    dataArray = []
    for i in range(total):
        if i < first_cut:
            prefix = 2
        elif i < second_cut:
            prefix = 9
        elif i < third_cut:
            prefix = 4
        else:
            prefix = randint(0, 9)
        dataArray.append(generateNumber(plz_length, prefix))

    shuffle(dataArray)
    return dataArray
# Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber):
    """Build a digit string that starts with ``startingNumber``.

    Args:
        numberLenght: Intended total length; ``numberLenght - 1`` random
            digits are appended after the prefix.
        startingNumber: Prefix, converted with ``str()``.

    Returns:
        The prefix followed by the random digits, as a string.
    """
    digits = [str(startingNumber)]
    # Collect the digits and join once instead of growing a string
    digits.extend(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return "".join(digits)
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart):
    """Write each entry of ``content`` as one line of a numbered test file.

    The file is ``testdata/file<N>.txt`` with
    ``N = int(nameChunkStart) + int(namePartStart)``.  The ``testdata``
    directory is not created here and has to exist already.

    Args:
        content: Sequence of strings, one per output line.
        nameChunkStart: First part of the file number.
        namePartStart: Second part of the file number.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # The context manager closes (and flushes) the file even on errors
    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:
        file.writelines(line + "\n" for line in content)

View File

@@ -1,19 +1,18 @@
#!/usr/bin/env python
#title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer
#license: Pending
#date: 26.05.2018
#version: 1.2
#usage: python pyscript.py
#notes:
#dependencies: matplotlib
#known_issues:
#python_version: 3.x
#==============================================================================
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.2
# usage: python pyscript.py
# notes:
# dependencies: matplotlib
# known_issues:
# python_version: 3.x
# ==============================================================================
# IMPORTS
# Importing the time for benchmarking purposes
import time
from datetime import date
@@ -28,111 +27,120 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(data, clusters):
    """Run the 1D k-means++ style clustering on ``data`` and plot the result.

    Cluster points are kept in module globals named ``cpoint_<i>``, which
    ``assignCluster`` and ``calcClusters`` read and update.

    Args:
        data: Sequence of int-convertible values (e.g. postal code strings).
        clusters: Number of clusters to compute.

    Returns:
        Always 0.
    """
    # First cluster point is a random data entry.  randint includes both
    # bounds, so the upper bound has to be the last valid index.
    globals()["cpoint_0"] = data[randint(0, len(data) - 1)]
    # Second cluster point is the entry farthest from the first one
    globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])

    print("Initial cluster 1: " + str(globals()["cpoint_0"]))
    print("Initial cluster 2: " + str(globals()["cpoint_1"]))

    # Every further cluster point is derived from the two previous ones
    for i in range(2, clusters):
        globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(
            data,
            globals()["cpoint_" + str(i - 1)],
            globals()["cpoint_" + str(i - 2)],
        )
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Numeric maximum of the data; comparing the raw entries would order
    # strings lexicographically
    highPoint = max(int(value) for value in data)

    # Variables for running the algorithm (runs is just for benchmarking)
    done, runs = False, 0

    # Reassign the data and rearrange the clusters until calcClusters reports done
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(0, clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Artificial y values (0..len-1) so the 1D data can be drawn in a 2D scatter plot
    anew = list(range(len(data)))

    # Drawing found clusters as vertical lines
    for i in range(0, clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
    """Move every cluster point to the rounded mean of its assigned items.

    The cluster points live in module globals named ``cpoint_<i>`` and
    are updated in place.

    Args:
        data: Two parallel sequences: ``data[0]`` holds the items and
            ``data[1]`` the cluster point assigned to each item.
        clusters: Number of cluster points to update.

    Returns:
        True when no cluster point moved (the clustering is done), False
        as soon as at least one cluster point changed.
    """
    converged = True
    for cluster in range(clusters):
        key = "cpoint_" + str(cluster)
        # Current cluster point, kept to detect whether it moves
        prev_cluster = globals()[key]

        members = [
            int(value)
            for value, assigned in zip(data[0], data[1])
            if assigned == prev_cluster
        ]
        if not members:
            # Nothing assigned: keep the point where it is instead of
            # dividing by zero
            continue

        new_cluster = round(sum(members) / len(members))
        globals()[key] = new_cluster

        # The run is finished only when every point stayed put; a single
        # unchanged point must not end the iteration early
        if int(prev_cluster) != new_cluster:
            converged = False

    return converged
def assignCluster(data, highPoint, clusters):
    """Assign every item of ``data`` to its nearest cluster point.

    Cluster points are read from the module globals ``cpoint_<i>`` and
    distances are measured with ``dmlib.calcdiff``.  On ties the cluster
    with the lowest index wins.

    Args:
        data: Sequence of items to assign.
        highPoint: Unused; kept so existing callers keep working.  The
            nearest cluster is determined directly, so no initial upper
            bound for the distance is needed.
        clusters: Number of cluster points to consider.

    Returns:
        A two-element list ``[data, data_assigned]`` where
        ``data_assigned[i]`` is the cluster point chosen for ``data[i]``.
    """
    # Read the cluster points once instead of once per comparison
    centers = [globals()["cpoint_" + str(cluster)] for cluster in range(clusters)]

    # Taking the minimum directly guarantees every item gets a cluster,
    # even when its distance is not below the largest data value
    data_assigned = [
        min(centers, key=lambda center: dmlib.calcdiff(item, center))
        for item in data
    ]

    return [data, data_assigned]
# Startup function for collecting necessary data
def startup(data):
    """Ask for the cluster count, run ``kmeansmk1`` on ``data`` and time it.

    Only the number of clusters is requested interactively; core count
    and data path are not configurable yet.

    Args:
        data: The data set handed on to ``kmeansmk1``.
    """
    cluster_count = int(input("How many clusters are known? "))

    # Benchmark covers the clustering run only, not the prompt
    began = time.time()
    kmeansmk1(data, cluster_count)
    elapsed = time.time() - began

    print(str(elapsed) + " seconds for execution")
# Start the algorithm and generate test data
data = dmtest.plzGen(10000)

View File

@@ -10,69 +10,79 @@ from datetime import date
# Importing for multi core processing
import multiprocessing
# randomI function which creates each file
def randomI(units, rows, rowLength, partstart):
    """Generate ``units`` files, numbering them from ``partstart`` upwards.

    Args:
        units: Number of files to create.
        rows: Row count handed to ``generateFile``.
        rowLength: Row length handed to ``generateFile``.
        partstart: Offset added to each file's running number.

    Returns:
        True once all files have been written.
    """
    for unit_index in range(units):
        unit_content = generateFile(rows, rowLength)
        writeFile(unit_content, unit_index, partstart)
    return True
# Function for generating the content of one single file
def generateFile(rows, rowLength):
    """Generate the content of one file as a list of random rows.

    Args:
        rows: Number of rows to generate (an int, so it has to be wrapped
            in ``range`` rather than iterated directly).
        rowLength: Length handed to ``generateRow`` for every row.

    Returns:
        A list with ``rows`` strings produced by ``generateRow``.
    """
    return [generateRow(rowLength) for _ in range(rows)]
# Function for generating the content of one single row randomly
def generateRow(rowLength):
    """Return a string of ``rowLength`` random decimal digits.

    Args:
        rowLength: Number of digits to generate.

    Returns:
        The digit string; empty when ``rowLength`` is 0.
    """
    # Join once instead of growing the string digit by digit
    return "".join(str(randint(0, 9)) for _ in range(rowLength))
# Function for writing data into a file
def writeFile(content, setcounter, partstart):
    """Write ``content`` line by line into a numbered file under ``testdata/``.

    The file is ``testdata/file<N>.txt`` with
    ``N = int(setcounter) + int(partstart)``.  The directory is not
    created here.

    Args:
        content: Iterable of strings, one per output line.
        setcounter: Running number of the file within its part.
        partstart: Offset of the part the file belongs to.

    Returns:
        True after the file has been written and closed.
    """
    filenumber = int(setcounter) + int(partstart)
    # The context manager closes (and flushes) the file even on errors
    with open("testdata/file" + str(filenumber) + ".txt", "w") as file:
        file.writelines(line + "\n" for line in content)
    return True
if __name__ == '__main__':
    # Getting the user input
    print("Hello World")
    units = int(input("How many units would you like to generate? "))
    rows = int(input("How many rows should each unit have? "))
    rowLength = int(input("How long should each row be? "))
    cores = int(input("How many cores do you want to use? "))

    # Splitting up the units into whole-number shares; the first
    # `remainder` workers take one extra unit so that no unit is dropped
    # when units is not divisible by cores
    partsize, remainder = divmod(units, cores)

    # For benchmarking starting the timer now
    start_time = time.time()

    # Initialize and prepare one process per core
    processes = []
    partstart = 0
    for core in range(cores):
        share = partsize + (1 if core < remainder else 0)
        processes.append(multiprocessing.Process(
            target=randomI,
            args=(share, rows, rowLength, partstart)
        ))
        # The next worker continues numbering where this one stops
        partstart += share

    # Starting each core
    for core, process in enumerate(processes):
        process.start()
        print("Core " + str(core) + " started.")

    print("Working...")

    # Joining each core for the process
    for process in processes:
        process.join()

    # Finishing up the process
    sec = time.time() - start_time
    print("Data is generated. Have fun!")
    print("randomI took " + str(sec) + " seconds for execution.")

View File

@@ -1,15 +1,15 @@
#!/usr/bin/env python
#title: randomI2.1.py
#description: Personal
#author: Tillmann Brendel, Conrad Großer
#license: Pending
#date: 26.05.2018
#version: 1.0
#usage: python pyscript.py
#notes:
#known_issues:
#python_version: 3.x
#==============================================================================
# title: randomI2.1.py
# description: Personal
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.0
# usage: python pyscript.py
# notes:
# known_issues:
# python_version: 3.x
# ==============================================================================
# For random generation of numbers import randint
from random import randint
@@ -21,76 +21,87 @@ from datetime import date
# Importing for multi core processing
import multiprocessing
# randomI function which creates each file
def randomI(units, rows, rowLength, partstart, cluster):
    """Generate ``units`` files, numbering them from ``partstart`` upwards.

    Args:
        units: Number of files to create.
        rows: Row count handed to ``generateFile``.
        rowLength: Row length handed to ``generateFile``.
        partstart: Offset added to each file's running number.
        cluster: Passed through to ``generateFile``.

    Returns:
        True once all files have been written.
    """
    for unit_index in range(units):
        unit_content = generateFile(rows, rowLength, cluster)
        writeFile(unit_content, unit_index, partstart)
    return True
# Function for generating the content of one single file
def generateFile(rows, rowLength, cluster):
    """Generate the content of one file: a code row followed by random rows.

    The first row comes from ``generate09()`` with a chance of
    1/``cluster`` and from ``generatePLZ()`` otherwise; all further rows
    come from ``generateRow``.

    Args:
        rows: Number of rows to generate (an int, so it has to be wrapped
            in ``range`` rather than iterated directly).
        rowLength: Length handed to ``generateRow``.
        cluster: Upper bound of the random draw that picks the first row.

    Returns:
        A list with ``rows`` strings.
    """
    content = []
    for row_index in range(rows):
        if row_index == 0:
            # NOTE(review): generate09 / generatePLZ are defined outside
            # this view; presumably they return postal code strings
            if randint(1, cluster) == 1:
                content.append(generate09())
            else:
                content.append(generatePLZ())
        else:
            content.append(generateRow(rowLength))
    return content
# Function for generating the content of one single row randomly
def generateRow(rowLength):
    """Return a string of ``rowLength`` random decimal digits.

    Args:
        rowLength: Number of digits to generate.

    Returns:
        The digit string; empty when ``rowLength`` is 0.
    """
    # Join once instead of growing the string digit by digit
    return ''.join(str(randint(0, 9)) for _ in range(rowLength))
# Function for writing data into a file (content = string, setcount and partstart are for better naming)
def writeFile(content, setcounter, partstart):
    """Write ``content`` line by line into a numbered file under ``testdata/``.

    The file is ``testdata/file<N>.txt`` with
    ``N = int(setcounter) + int(partstart)``.  The directory is not
    created here.

    Args:
        content: Iterable of strings, one per output line.
        setcounter: Running number of the file within its part.
        partstart: Offset of the part the file belongs to.

    Returns:
        True after the file has been written and closed.
    """
    filenumber = int(setcounter) + int(partstart)
    # The context manager closes (and flushes) the file even on errors
    with open('testdata/file' + str(filenumber) + '.txt', 'w') as file:
        file.writelines(line + '\n' for line in content)
    return True
if __name__ == '__main__':
    # Getting the user input
    print('Hello World')
    units = int(input('How many units would you like to generate? '))
    rows = int(input('How many rows should each unit have? '))
    rowLength = int(input('How long should each row be? '))
    cores = int(input('How many cores do you want to use? '))
    cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))

    # Splitting up the units into whole-number shares; the first
    # `remainder` workers take one extra unit so that no unit is dropped
    # when units is not divisible by cores
    partsize, remainder = divmod(units, cores)

    # For benchmarking starting the timer now
    start_time = time.time()

    # Initialize and prepare one process per core
    processes = []
    partstart = 0
    for core in range(cores):
        share = partsize + (1 if core < remainder else 0)
        processes.append(multiprocessing.Process(
            target=randomI,
            args=(share, rows, rowLength, partstart, cluster)
        ))
        # The next worker continues numbering where this one stops
        partstart += share

    # Starting each core
    for core, process in enumerate(processes):
        process.start()
        print('Core ' + str(core) + ' started.')

    print('Working...')

    # Joining each core for the process
    for process in processes:
        process.join()

    # Finishing up the process
    sec = time.time() - start_time
    print('Data is generated. Have fun!')
    print('randomI took ' + str(sec) + ' seconds for execution.')