Compare commits

3 Commits

Author SHA1 Message Date
94ed193954 Update .gitignore 2019-04-25 12:45:05 +02:00
e2ad63f90f Code Rework 2019-04-25 12:43:37 +02:00
25fa068df9 kmeans ++ Update
- Added concept for kmeans ++
2018-06-02 23:31:43 +02:00
6 changed files with 276 additions and 255 deletions

1
.gitignore vendored
View File

@@ -1,2 +1,3 @@
testdata/ testdata/
__pycache__/ __pycache__/
.DS_Store

View File

@@ -4,36 +4,23 @@ def calcdiff(point1, point2):
difference = int(point2) - int(point1) difference = int(point2) - int(point1)
else: else:
difference = int(point1) - int(point2) difference = int(point1) - int(point2)
# print("Datapoint: " + str(data[point1]) + " | Cluster: " + str(data[point2]) + " | Difference: " + str(difference)) return abs(difference)
return betrag(difference)
# Get the absolute value of a number and returns it as int
def betrag(number):
if number < 0:
number = int((-2 * number) / 2)
return number
# Determine the highest int value in an array and returns is as an int
def findHighest(data):
maximum = 0
for i in range(0, len(data)):
if int(data[i]) > maximum:
maximum = int(data[i])
return maximum
def pp_calcdiff(data, clusterpoint): def pp_calcdiff(data, clusterpoint):
max_diff = 0 max_diff = 0
new_cluster = 0 new_cluster = 0
for item in range(0,len(data)): for item in range(0, len(data)):
if calcdiff(data[item], clusterpoint) > max_diff: if calcdiff(data[item], clusterpoint) > max_diff:
max_diff = calcdiff(data[item], clusterpoint) max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item] new_cluster = data[item]
return new_cluster return new_cluster
def pp_calcdiff_2(data, clusterpoint, clusterpoint_2): def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
max_diff = 0 max_diff = 0
new_cluster = 0 new_cluster = 0
for item in range(0,len(data)): for item in range(0, len(data)):
if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff: if calcdiff(data[item], clusterpoint) + calcdiff(data[item], clusterpoint_2) > max_diff:
max_diff = calcdiff(data[item], clusterpoint) max_diff = calcdiff(data[item], clusterpoint)
new_cluster = data[item] new_cluster = data[item]

View File

@@ -1,6 +1,7 @@
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint, shuffle from random import randint, shuffle
# Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs # Simple generator for test plzs (40-40-20 biased), returns 1D array of plzs
def plzGen(entries): def plzGen(entries):
dataArray = [] dataArray = []
@@ -13,18 +14,20 @@ def plzGen(entries):
elif i >= round(entries * 0.6) and i < round(entries * 0.9): elif i >= round(entries * 0.6) and i < round(entries * 0.9):
plz = generateNumber(plz_lenght, 4) plz = generateNumber(plz_lenght, 4)
else: else:
plz = generateNumber(plz_lenght, randint(0,9)) plz = generateNumber(plz_lenght, randint(0, 9))
dataArray.append(plz) dataArray.append(plz)
shuffle(dataArray) shuffle(dataArray)
return dataArray return dataArray
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateNumber(numberLenght, startingNumber): def generateNumber(numberLenght, startingNumber):
number = str(startingNumber) number = str(startingNumber)
for length in range(0, numberLenght - 1): for length in range(0, numberLenght - 1):
number = number + str(randint(0,9)) number = number + str(randint(0, 9))
return number return number
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming) # Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
# /testdata/ folder has to be created at this point # /testdata/ folder has to be created at this point
def writeFile(content, nameChunkStart, namePartStart): def writeFile(content, nameChunkStart, namePartStart):
@@ -32,3 +35,4 @@ def writeFile(content, nameChunkStart, namePartStart):
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open("testdata/file" + str(filenumber) + ".txt", "w")
for w in range(0, len(content)): for w in range(0, len(content)):
file.write(content[w] + "\n") file.write(content[w] + "\n")

View File

@@ -1,19 +1,18 @@
#!/usr/bin/env python #!/usr/bin/env python
#title: kmeansMkI.py # title: kmeansMkI.py
#description: Our personal Python K-Means++ implementation # description: Our personal Python K-Means++ implementation
#author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
#license: Pending # license: Pending
#date: 26.05.2018 # date: 26.05.2018
#version: 1.2 # version: 1.2
#usage: python pyscript.py # usage: python pyscript.py
#notes: # notes:
#dependencies: mathplotlib # dependencies: mathplotlib
#known_issues: # known_issues:
#python_version: 3.x # python_version: 3.x
#============================================================================== # ==============================================================================
# IMPORTS # IMPORTS
# Importing the time for benchmarking purposes # Importing the time for benchmarking purposes
import time import time
from datetime import date from datetime import date
@@ -28,24 +27,29 @@ import matplotlib.pyplot as plt
import dmlib import dmlib
import dmtest import dmtest
# CODE
# Main function of the algorithm # Main function of the algorithm
def kmeansmk1(data, clusters): def kmeansmk1(data, clusters):
globals()["cpoint_0"] = data[randint(0, len(data))]
globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])
print("Initial cluster 1: " + str(globals()["cpoint_0"]))
print("Initial cluster 2: " + str(globals()["cpoint_1"]))
# Defining cluster points # Defining cluster points
for i in range(0, clusters): for i in range(2, clusters):
globals()["cpoint_" + str(i)] = data[randint(0, len(data))] globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(data, globals()["cpoint_" + str(i - 1)], globals()["cpoint_" + str(i - 2)])
print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)])) print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))
# Get max value in the data array # Get max value in the data array
highPoint = dmlib.findHighest(data) highPoint = max(data)
# Define variables for running the algorithm (runs is just for benchmarking!) # Define variables for running the algorithm (runs is just for benchmarking!)
done = 0 done, runs = False, 0
runs = 0
# As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters # As long as calcClusters returns done it will rearange the clusters and assign the data to the clusters
while done == 0: while not done:
runs = runs + 1 runs += 1
new_data = assignCluster(data, highPoint, clusters) new_data = assignCluster(data, highPoint, clusters)
done = calcClusters(new_data, clusters) done = calcClusters(new_data, clusters)
@@ -54,11 +58,11 @@ def kmeansmk1(data, clusters):
print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs") print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")
# Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data # Getting artificial array for visualizing 1D data in an 2D graphic of the size of the original data
anew = [] anew, inew = [], 0
inew = 0
while inew < len(data): while inew < len(data):
anew.append(inew) anew.append(inew)
inew = inew + 1 inew += 1
# Drawing found clusters as lines # Drawing found clusters as lines
for i in range(0, clusters): for i in range(0, clusters):
@@ -70,9 +74,10 @@ def kmeansmk1(data, clusters):
return 0 return 0
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster) # Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
def calcClusters(data, clusters): def calcClusters(data, clusters):
changed = 0 changed = False
for cluster in range(0, clusters): for cluster in range(0, clusters):
# Getting current cluster and saving it in temporary variable # Getting current cluster and saving it in temporary variable
prev_cluster = globals()["cpoint_" + str(cluster)] prev_cluster = globals()["cpoint_" + str(cluster)]
@@ -88,28 +93,29 @@ def calcClusters(data, clusters):
# Checking if previous clusterpoint is equal to the one just calculated # Checking if previous clusterpoint is equal to the one just calculated
if prev_cluster == globals()["cpoint_" + str(cluster)]: if prev_cluster == globals()["cpoint_" + str(cluster)]:
changed = 1 changed = True
return changed return changed
def assignCluster(data, highPoint, clusters): def assignCluster(data, highPoint, clusters):
# Create a new data array for working # Create a new data array for working
new_data = [] new_data = [data]
new_data.append(data)
# Create new array for assigned clusters of each value # Create new array for assigned clusters of each value
data_assigned = [] data_assigned = []
# For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index]) # For each item in data find the minimal difference to a cluster and write it in the new data array in the second place (new_data[item][cluster_index])
for item in range(0, len(new_data[0])): for item in data:
# Set the minimal cluster difference to the highest difference in the list to ease comparision # Set the minimal cluster difference to the highest difference in the list to ease comparision
min_cluster = highPoint min_cluster = highPoint
# Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference # Check the difference between the point (item) and each cluster and set min_cluster to the smallest difference
for cluster in range(0, clusters): for cluster in range(0, clusters):
if min_cluster > dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]): if int(min_cluster) > dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)]):
min_cluster = dmlib.calcdiff(data[item], globals()["cpoint_" + str(cluster)]) min_cluster = dmlib.calcdiff(item, globals()["cpoint_" + str(cluster)])
assinged_cluster = globals()["cpoint_" + str(cluster)] assinged_cluster = globals()["cpoint_" + str(cluster)]
# Assign the minimal difference cluster to the data # Assign the minimal difference cluster to the data
data_assigned.append(assinged_cluster) data_assigned.append(assinged_cluster)
# Add the assigned values list to the new_data array # Add the assigned values list to the new_data array
@@ -117,6 +123,7 @@ def assignCluster(data, highPoint, clusters):
return new_data return new_data
# Startup function for collecting necesarry data # Startup function for collecting necesarry data
def startup(data): def startup(data):
# Using two clusters for testing # Using two clusters for testing
@@ -134,6 +141,7 @@ def startup(data):
seconds = time.time() - start_time seconds = time.time() - start_time
print(str(seconds) + " seconds for execution") print(str(seconds) + " seconds for execution")
# Start the algorithm and generate test data # Start the algorithm and generate test data
data = dmtest.plzGen(10000) data = dmtest.plzGen(10000)

View File

@@ -10,31 +10,38 @@ from datetime import date
# Importing for multi core processing # Importing for multi core processing
import multiprocessing import multiprocessing
# randomI function which creates each file # randomI function which creates each file
def randomI(units, rows, rowLength, partstart): def randomI(units, rows, rowLength, partstart):
for setcounter in range(0, units): for setcounter in range(0, units):
writeFile(generateFile(rows, rowLength), setcounter, partstart) writeFile(generateFile(rows, rowLength), setcounter, partstart)
return True
# Function for generating the content of one single file # Function for generating the content of one single file
def generateFile(rows, rowLength): def generateFile(rows, rowLength):
content = [] content = []
for y in range(0, rows): for entry in rows:
content.append(generateRow(rowLength)) content.append(generateRow(rowLength))
return content return content
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateRow(rowLength): def generateRow(rowLength):
row = "" row = ""
for z in range(0, rowLength): for z in range(0, rowLength):
row = row + str(randint(0, 9)) row += str(randint(0, 9))
return row return row
# Function for writing data into a file # Function for writing data into a file
def writeFile(content, setcounter, partstart): def writeFile(content, setcounter, partstart):
filenumber = int(setcounter) + int(partstart) filenumber = int(setcounter) + int(partstart)
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open("testdata/file" + str(filenumber) + ".txt", "w")
for w in range(0, len(content)): for line in content:
file.write(content[w] + "\n") file.write(line + "\n")
return True
if __name__ == '__main__': if __name__ == '__main__':
# Getting the user input # Getting the user input
@@ -45,7 +52,7 @@ if __name__ == '__main__':
cores = int(input("How many cores do you want to use? ")) cores = int(input("How many cores do you want to use? "))
# Splitting up the units # Splitting up the units
count = int(0) count = 0
partsize = units / cores partsize = units / cores
# For benchmarking starting the timer now # For benchmarking starting the timer now
@@ -54,23 +61,26 @@ if __name__ == '__main__':
# Initialize and prepare cores for process # Initialize and prepare cores for process
while count < cores: while count < cores:
partstart = partsize * count partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart)) globals()["p" + str(count)] = multiprocessing.Process(
count = count + 1 target=randomI,
args=(int(partsize), rows, rowLength, partstart)
)
count += 1
# Starting each core # Starting each core
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].start() globals()["p" + str(count)].start()
print("Core " + str(count) + " started.") print("Core " + str(count) + " started.")
count = count + 1 count += 1
print("Working...") print("Working...")
# Joining each core for the process # Joining each core for the process
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].join() globals()["p" + str(count)].join()
count = count + 1 count += 1
# Finishing up the process # Finishing up the process
sec = time.time() - start_time sec = time.time() - start_time

View File

@@ -1,15 +1,15 @@
#!/usr/bin/env python #!/usr/bin/env python
#title: randomI2.1.py # title: randomI2.1.py
#description: Personal # description: Personal
#author: Tillmann Brendel, Conrad Großer # author: Tillmann Brendel, Conrad Großer
#license: Pending # license: Pending
#date: 26.05.2018 # date: 26.05.2018
#version: 1.0 # version: 1.0
#usage: python pyscript.py # usage: python pyscript.py
#notes: # notes:
#known_issues: # known_issues:
#python_version: 3.x # python_version: 3.x
#============================================================================== # ==============================================================================
# For random generation of numbers import randint # For random generation of numbers import randint
from random import randint from random import randint
@@ -21,17 +21,20 @@ from datetime import date
# Importing for multi core processing # Importing for multi core processing
import multiprocessing import multiprocessing
# randomI function which creates each file # randomI function which creates each file
def randomI(units, rows, rowLength, partstart, cluster): def randomI(units, rows, rowLength, partstart, cluster):
for setcounter in range(0, units): for setcounter in range(0, units):
writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart) writeFile(generateFile(rows, rowLength, cluster), setcounter, partstart)
return True
# Function for generating the content of one single file # Function for generating the content of one single file
def generateFile(rows, rowLength, cluster): def generateFile(rows, rowLength, cluster):
content = [] content = []
for y in range(0, rows): for entry in rows:
if y == 0: if entry == 0:
if 1 == randint(1, cluster): if randint(1, cluster) == 1:
content.append(generate09()) content.append(generate09())
else: else:
content.append(generatePLZ()) content.append(generatePLZ())
@@ -39,31 +42,36 @@ def generateFile(rows, rowLength, cluster):
content.append(generateRow(rowLength)) content.append(generateRow(rowLength))
return content return content
# Function for generating the content of one single row randomly # Function for generating the content of one single row randomly
def generateRow(rowLength): def generateRow(rowLength):
row = "" row = ''
for z in range(0, rowLength): for z in range(0, rowLength):
row = row + str(randint(0, 9)) row += str(randint(0, 9))
return row return row
# Function for writing data into a file (content = string, setcount and partstart are for better naming) # Function for writing data into a file (content = string, setcount and partstart are for better naming)
def writeFile(content, setcounter, partstart): def writeFile(content, setcounter, partstart):
filenumber = int(setcounter) + int(partstart) filenumber = int(setcounter) + int(partstart)
file = open("testdata/file" + str(filenumber) + ".txt", "w") file = open('testdata/file' + str(filenumber) + '.txt', 'w')
for w in range(0, len(content)):
file.write(content[w] + "\n") for line in content:
file.write(line + '\n')
return True
if __name__ == '__main__': if __name__ == '__main__':
# Getting the user input # Getting the user input
print("Hello World") print('Hello World')
units = int(input("How many units would you like to generate? ")) units = int(input('How many units would you like to generate? '))
rows = int(input("How many rows should each unit have? ")) rows = int(input('How many rows should each unit have? '))
rowLength = int(input("How long should each row be? ")) rowLength = int(input('How long should each row be? '))
cores = int(input("How many cores do you want to use? ")) cores = int(input('How many cores do you want to use? '))
cluster = int(input("What fraction of postal codes should be in the 09xxx cluster? 1/")) cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
# Splitting up the units # Splitting up the units
count = int(0) count = 0
partsize = units / cores partsize = units / cores
# For benchmarking starting the timer now # For benchmarking starting the timer now
@@ -72,25 +80,28 @@ if __name__ == '__main__':
# Initialize and prepare cores for process # Initialize and prepare cores for process
while count < cores: while count < cores:
partstart = partsize * count partstart = partsize * count
globals()["p" + str(count)] = multiprocessing.Process(target=randomI, args=(int(partsize), rows, rowLength, partstart, cluster)) globals()['p' + str(count)] = multiprocessing.Process(
count = count + 1 target=randomI,
args=(int(partsize), rows, rowLength, partstart, cluster)
)
count += 1
# Starting each core # Starting each core
count = int(0) count = int(0)
while count < cores: while count < cores:
globals()["p" + str(count)].start() globals()['p' + str(count)].start()
print("Core " + str(count) + " started.") print('Core ' + str(count) + ' started.')
count = count + 1 count += 1
print("Working...") print('Working...')
# Joining each core for the process # Joining each core for the process
count = int(0) count = 0
while count < cores: while count < cores:
globals()["p" + str(count)].join() globals()['p' + str(count)].join()
count = count + 1 count += 1
# Finishing up the process # Finishing up the process
sec = time.time() - start_time sec = time.time() - start_time
print("Data is generated. Have fun!") print('Data is generated. Have fun!')
print("randomI took " + str(sec) + " seconds for execution.") print('randomI took ' + str(sec) + ' seconds for execution.')