Compare commits

3 Commits

Author SHA1 Message Date
94ed193954 Update .gitignore 2019-04-25 12:45:05 +02:00
e2ad63f90f Code Rework 2019-04-25 12:43:37 +02:00
25fa068df9 kmeans ++ Update
- Added concept for kmeans ++
2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
testdata/
__pycache__/
.DS_Store

View File

@@ -4,36 +4,23 @@ def calcdiff(point1, point2):
difference = int(point2) - int(point1)
else:
difference = int(point1) - int(point2)
# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference))
return betrag(difference)
return abs(difference)
# Gets the absolute value of a number and returns it as an int
def betrag(number):
    """Return the absolute value of ``number``.

    Non-negative input is returned unchanged. Negative input is negated and
    passed through ``int()``, so a negative float is truncated to an int
    exactly as before.

    Plain negation is used instead of ``(-2 * number) / 2``: the true
    division produced a float, which silently lost precision for integers
    too large to be represented exactly as a float.
    """
    if number < 0:
        return int(-number)
    return number
# Determines the highest int value in an array and returns it as an int
def findHighest(data):
    """Return the largest element of ``data`` after converting each to int.

    Every element is passed through ``int()`` before comparison, so numeric
    strings are accepted. An empty ``data`` yields 0.

    The running maximum used to start at 0, which made the function return
    0 for data consisting only of negative values; the real maximum is
    returned now.
    """
    return max((int(value) for value in data), default=0)
def pp_calcdiff(data, clusterpoint):
    """Return the element of ``data`` with the largest difference to
    ``clusterpoint``, as measured by ``calcdiff``.

    Returns 0 when ``data`` is empty or when no element has a difference
    greater than 0. On ties the first element encountered wins, because the
    comparison is strict.
    """
    max_diff = 0
    new_cluster = 0
    for candidate in data:
        # Compute the difference once per element instead of twice.
        diff = calcdiff(candidate, clusterpoint)
        if diff > max_diff:
            max_diff = diff
            new_cluster = candidate
    return new_cluster
def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
max_diff = 0
new_cluster = 0
for item in range(0,len(data)):
for item in range(0, len(data)):
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item]

View File

@@ -1,6 +1,7 @@
# For random generation of numbers import randint
from random import randint, shuffle
# Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
def plzGen(entries):
dataArray = []
@@ -13,18 +14,20 @@ def plzGen(entries):
elif i >= round(entries * 0.6) and i < round(entries * 0.9):
plz = generateNumber(plz_lenght, 4)
else:
plz = generateNumber(plz_lenght, randint(0,9))
plz = generateNumber(plz_lenght, randint(0, 9))
dataArray.append(plz)
shuffle(dataArray)
return dataArray
# Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber):
    """Return a digit string that begins with ``startingNumber``.

    ``str(startingNumber)`` is followed by ``numberLenght - 1`` random
    digits, each drawn with ``randint(0, 9)``. When ``numberLenght`` is 1 or
    less, only ``str(startingNumber)`` is returned.
    """
    # Exactly one random digit per remaining position, joined in one pass.
    tail = "".join(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return str(startingNumber) + tail
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart):
@@ -32,3 +35,4 @@ def writeFile(content, nameChunkStart, namePartStart):
file = open("testdata/file" + str(filenumber) + ".txt", "w")
for w in range(0, len(content)):
file.write(content[w] + "\n")

View File

@@ -1,19 +1,18 @@
#!/usr/bin/env python
#title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer
#license: Pending
#date: 26.05.2018
#version: 1.2
#usage: python pyscript.py
#notes:
#dependencies: matplotlib
#known_issues:
#python_version: 3.x
#==============================================================================
# title: kmeansMkI.py
# description: Our personal Python K-Means++ implementation
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.2
# usage: python pyscript.py
# notes:
# dependencies: matplotlib
# known_issues:
# python_version: 3.x
# ==============================================================================
# IMPORTS
# Importing the time for benchmarking purposes
import time
from datetime import date
@@ -28,24 +27,29 @@ import matplotlib.pyplot as plt
import dmlib
import dmtest
# CODE
# Main function of the algorithm
def kmeansmk1(data, clusters):
globals()["cpoint_0"] = data[randint(0, len(data))]
globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
print("Initial cluster 1: " + str(globals()["cpoint_0"]))
print("Initial cluster 2: " + str(globals()["cpoint_1"]))
# Defining cluster points
for i in range(0, clusters):
globals()["cpoint_" + str(i)] = data[randint(0, len(data))]
for i in range(2, clusters):
globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
# Get max value in the data array
highPoint = dmlib.findHighest(data)
highPoint = max(data)
# Define variables for running the algorithm (runs is just for benchmarking!)
done = 0
runs = 0
done, runs = False, 0
# Until calcClusters reports done, keep assigning the data to the clusters and rearranging the clusters
while done == 0:
runs = runs + 1
while not done:
runs += 1
new_data = assignCluster(data, highPoint, clusters)
done = calcClusters(new_data, clusters)
@@ -54,11 +58,11 @@ def kmeansmk1(data, clusters):
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
# Getting an artificial array, the size of the original data, for visualizing 1D data in a 2D graphic
anew = []
inew = 0
anew, inew = [], 0
while inew < len(data):
anew.append(inew)
inew = inew + 1
inew += 1
# Drawing found clusters as lines
for i in range(0, clusters):
@@ -70,9 +74,10 @@ def kmeansmk1(data, clusters):
return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters):
changed = 0
changed = False
for cluster in range(0, clusters):
# Getting current cluster and saving it in temporary variable
prev_cluster = globals()["cpoint_" + str(cluster)]
@@ -88,28 +93,29 @@ def calcClusters(data, clusters):
# Checking if previous clusterpoint is equal to the one just calculated
if prev_cluster == globals()["cpoint_" + str(cluster)]:
changed = 1
changed = True
return changed
def assignCluster(data, highPoint, clusters):
# Create a new data array for working
new_data = []
new_data.append(data)
new_data = [data]
# Create new array for assigned clusters of each value
data_assigned = []
# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
for item in range(0, len(new_data[0])):
for item in data:
# Set the minimal cluster difference to the highest difference in the list to ease comparison
min_cluster = highPoint
# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
for cluster in range(0, clusters):
if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]):
min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)])
if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]):
min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)])
assinged_cluster = globals()["cpoint_" + str(cluster)]
# Assign the minimal difference cluster to the data
data_assigned.append(assinged_cluster)
# Add the assigned values list to the new_data array
@@ -117,6 +123,7 @@ def assignCluster(data, highPoint, clusters):
return new_data
# Startup function for collecting necessary data
def startup(data):
# Using two clusters for testing
@@ -134,6 +141,7 @@ def startup(data):
seconds = time.time() - start_time
print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test data
data = dmtest.plzGen(10000)

View File

@@ -10,31 +10,38 @@ from datetime import date
# Importing for multi core processing
import multiprocessing
# randomI function which creates each file
def randomI(units, rows, rowLength, partstart):
    """Generate ``units`` files and write each one to disk.

    For every ``setcounter`` in ``range(units)`` the content produced by
    ``generateFile(rows, rowLength)`` is handed to ``writeFile`` together
    with ``setcounter`` and ``partstart``.

    Always returns True.
    """
    for setcounter in range(units):
        content = generateFile(rows, rowLength)
        writeFile(content, setcounter, partstart)
    return True
# Function for generating the content of one single file
def generateFile(rows, rowLength):
    """Return a list of ``rows`` rows, each built by ``generateRow(rowLength)``.

    ``rows`` is a count, so the loop must run over ``range(rows)``;
    iterating over ``rows`` itself raises TypeError when it is an int.
    """
    return [generateRow(rowLength) for _ in range(rows)]
# Function for generating the content of one single row randomly
def generateRow(rowLength):
    """Return a string of ``rowLength`` random digits.

    Each digit is drawn with ``randint(0, 9)``. A ``rowLength`` of 0 or less
    yields the empty string.
    """
    return "".join(str(randint(0, 9)) for _ in range(rowLength))
# Function for writing data into a file
def writeFile(content, setcounter, partstart):
    """Write every entry of ``content`` as one line of a numbered text file.

    The file number is ``int(setcounter) + int(partstart)`` and the target
    is ``testdata/file<number>.txt``, opened in write mode. The ``testdata``
    directory is not created here and must already exist.

    Always returns True.
    """
    filenumber = int(setcounter) + int(partstart)
    # The with-block closes the handle (and flushes it) even when a write
    # fails; previously the file was never closed.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as handle:
        for line in content:
            handle.write(line + "\n")
    return True
if __name__ == '__main__':
# Getting the user input
@@ -45,7 +52,7 @@ if __name__ == '__main__':
cores = int(input("How many cores do you want to use? "))
# Splitting up the units
count = int(0)
count = 0
partsize = units / cores
# For benchmarking starting the timer now
@@ -54,23 +61,26 @@ if __name__ == '__main__':
# Initialize and prepare cores for process
while count < cores:
partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart))
count = count + 1
globals()["p" + str(count)] = multiprocessing.Process(
target=randomI,
args=(int(partsize), rows, rowLength, partstart)
)
count += 1
# Starting each core
count = int(0)
count = 0
while count < cores:
globals()["p" + str(count)].start()
print("Core " + str(count) + " started.")
count = count + 1
count += 1
print("Working...")
# Joining each core for the process
count = int(0)
count = 0
while count < cores:
globals()["p" + str(count)].join()
count = count + 1
count += 1
# Finishing up the process
sec = time.time() - start_time

View File

@@ -1,15 +1,15 @@
#!/usr/bin/env python
#title: randomI2.1.py
#description: Personal
#author: Tillmann Brendel, Conrad Großer
#license: Pending
#date: 26.05.2018
#version: 1.0
#usage: python pyscript.py
#notes:
#known_issues:
#python_version: 3.x
#==============================================================================
# title: randomI2.1.py
# description: Personal
# author: Tillmann Brendel, Conrad Großer
# license: Pending
# date: 26.05.2018
# version: 1.0
# usage: python pyscript.py
# notes:
# known_issues:
# python_version: 3.x
# ==============================================================================
# For random generation of numbers import randint
from random import randint
@@ -21,17 +21,20 @@ from datetime import date
# Importing for multi core processing
import multiprocessing
# randomI function which creates each file
def randomI(units, rows, rowLength, partstart, cluster):
    """Generate ``units`` files and write each one to disk.

    For every ``setcounter`` in ``range(units)`` the content produced by
    ``generateFile(rows, rowLength, cluster)`` is handed to ``writeFile``
    together with ``setcounter`` and ``partstart``.

    Always returns True.
    """
    for setcounter in range(units):
        content = generateFile(rows, rowLength, cluster)
        writeFile(content, setcounter, partstart)
    return True
# Function for generating the content of one single file
def generateFile(rows, rowLength, cluster):
content = []
for y in range(0, rows):
if y == 0:
if 1 == randint(1, cluster):
for entry in rows:
if entry == 0:
if randint(1, cluster) == 1:
content.append(generate09())
else:
content.append(generatePLZ())
@@ -39,31 +42,36 @@ def generateFile(rows, rowLength, cluster):
content.append(generateRow(rowLength))
return content
# Function for generating the content of one single row randomly
def generateRow(rowLength):
    """Return a string of ``rowLength`` random digits.

    Each digit is drawn with ``randint(0, 9)``. A ``rowLength`` of 0 or less
    yields the empty string.
    """
    return ''.join(str(randint(0, 9)) for _ in range(rowLength))
# Function for writing data into a file (content = string, setcount and partstart are for better naming)
def writeFile(content, setcounter, partstart):
    """Write every entry of ``content`` as one line of a numbered text file.

    The file number is ``int(setcounter) + int(partstart)`` and the target
    is ``testdata/file<number>.txt``, opened in write mode. The ``testdata``
    directory is not created here and must already exist.

    Always returns True.
    """
    filenumber = int(setcounter) + int(partstart)
    # The with-block closes the handle (and flushes it) even when a write
    # fails; previously the file was never closed.
    with open('testdata/file' + str(filenumber) + '.txt', 'w') as handle:
        for line in content:
            handle.write(line + '\n')
    return True
if __name__ == '__main__':
# Getting the user input
print("Hello World")
units = int(input("How many units would you like to generate? "))
rows = int(input("How many rows should each unit have? "))
rowLength = int(input("How long should each row be? "))
cores = int(input("How many cores do you want to use? "))
cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/"))
print('Hello World')
units = int(input('How many units would you like to generate? '))
rows = int(input('How many rows should each unit have? '))
rowLength = int(input('How long should each row be? '))
cores = int(input('How many cores do you want to use? '))
cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
# Splitting up the units
count = int(0)
count = 0
partsize = units / cores
# For benchmarking starting the timer now
@@ -72,25 +80,28 @@ if __name__ == '__main__':
# Initialize and prepare cores for process
while count < cores:
partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster))
count = count + 1
globals()['p' + str(count)] = multiprocessing.Process(
target=randomI,
args=(int(partsize), rows, rowLength, partstart, cluster)
)
count += 1
# Starting each core
count = int(0)
while count < cores:
globals()["p" + str(count)].start()
print("Core " + str(count) + " started.")
count = count + 1
globals()['p' + str(count)].start()
print('Core ' + str(count) + ' started.')
count += 1
print("Working...")
print('Working...')
# Joining each core for the process
count = int(0)
count = 0
while count < cores:
globals()["p" + str(count)].join()
count = count + 1
globals()['p' + str(count)].join()
count += 1
# Finishing up the process
sec = time.time() - start_time
print("Data is generated. Have fun!")
print("randomI took " + str(sec) + " seconds for execution.")
print('Data is generated. Have fun!')
print('randomI took ' + str(sec) + ' seconds for execution.')