Compare commits
3 Commits
master
...
dev_kmeans
| Author | SHA1 | Date | |
|---|---|---|---|
| 94ed193954 | |||
| e2ad63f90f | |||
| 25fa068df9 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
|||||||
testdata/
|
testdata/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
.DS_Store
|
||||||
|
|||||||
@@ -1,40 +1,27 @@
|
|||||||
# Calculate the difference between two points giving the indexes of these data entries
|
# Calculate the difference between two points giving the indexes of these data entries
|
||||||
def calcdiff(point1, point2):
    """Return the absolute distance between two one-dimensional points.

    Both arguments are converted with ``int()``, so numeric strings (for
    example postal codes) are accepted as well as integers.

    Raises:
        ValueError: if an argument cannot be converted to ``int``.
    """
    # abs() covers both orderings, so no branch on which value is larger.
    return abs(int(point2) - int(point1))
|
|
||||||
|
|
||||||
# Get the absolute value of a number and returns it as int
|
|
||||||
def betrag(number):
    """Return the absolute value ("Betrag") of ``number``.

    Negative inputs are negated and converted to ``int``; non-negative
    inputs are returned unchanged.
    """
    if number < 0:
        # Plain negation stays exact for arbitrarily large integers, whereas
        # the former int((-2 * number) / 2) went through float division.
        number = int(-number)
    return number
|
|
||||||
|
|
||||||
# Determine the highest int value in an array and returns is as an int
|
|
||||||
def findHighest(data):
    """Return the largest entry of ``data`` as an ``int``.

    Every entry is converted with ``int()`` before comparison. An empty
    sequence yields 0, which is what the loop-based version returned.
    """
    if len(data) == 0:
        return 0
    # Taking the real maximum (instead of starting from 0) also gives the
    # right answer when every entry is negative.
    return max(int(entry) for entry in data)
|
|
||||||
|
|
||||||
def pp_calcdiff(data, clusterpoint):
    """Return the entry of ``data`` that lies farthest from ``clusterpoint``.

    Distances are measured with ``calcdiff``. On ties the first entry with
    the largest distance wins. If ``data`` is empty, or no entry has a
    distance greater than 0, the initial value 0 is returned.
    """
    max_diff = 0
    new_cluster = 0
    for item in data:
        # Compute the distance once per item instead of twice.
        diff = calcdiff(item, clusterpoint)
        if diff > max_diff:
            max_diff = diff
            new_cluster = item
    return new_cluster
|
||||||
|
|
||||||
|
|
||||||
def pp_calcdiff_2(data, clusterpoint, clusterpoint_2):
    """Return the entry of ``data`` farthest from two cluster points combined.

    The score of an entry is the sum of its ``calcdiff`` distances to
    ``clusterpoint`` and ``clusterpoint_2``. On ties the first entry with
    the highest score wins. If ``data`` is empty, or no entry scores above
    0, the initial value 0 is returned.
    """
    max_diff = 0
    new_cluster = 0
    for item in data:
        combined = calcdiff(item, clusterpoint) + calcdiff(item, clusterpoint_2)
        if combined > max_diff:
            # Remember the combined score. Storing only the first distance
            # made later comparisons use a threshold that was too low.
            max_diff = combined
            new_cluster = item
    return new_cluster
|
||||||
|
|||||||
@@ -1,34 +1,38 @@
|
|||||||
# For random generation of numbers import randint
|
# For random generation of numbers import randint
|
||||||
from random import randint, shuffle
|
from random import randint, shuffle
|
||||||
|
|
||||||
|
|
||||||
def plzGen(entries):
    """Generate a shuffled list of biased 5-character test postal codes.

    The codes are built with ``generateNumber`` and distributed by leading
    digit as follows (boundaries are rounded):

    * first 40 %  -> leading digit 2
    * next 20 %   -> leading digit 9
    * next 30 %   -> leading digit 4
    * last 10 %   -> random leading digit 0-9

    Args:
        entries: number of codes to generate; converted with ``int()``.

    Returns:
        A list of ``int(entries)`` strings in random order.
    """
    total = int(entries)
    plz_length = 5

    # Bucket boundaries are loop-invariant, so compute them once.
    first_end = round(total * 0.4)
    second_end = round(total * 0.6)
    third_end = round(total * 0.9)

    dataArray = []
    for i in range(total):
        if i < first_end:
            leading_digit = 2
        elif i < second_end:
            leading_digit = 9
        elif i < third_end:
            leading_digit = 4
        else:
            leading_digit = randint(0, 9)
        dataArray.append(generateNumber(plz_length, leading_digit))

    # Shuffle so the buckets are not stored contiguously.
    shuffle(dataArray)
    return dataArray
|
||||||
|
|
||||||
|
|
||||||
# Function for generating the content of one single row randomly
|
# Function for generating the content of one single row randomly
|
||||||
def generateNumber(numberLenght, startingNumber):
    """Return a random digit string that begins with ``startingNumber``.

    ``str(startingNumber)`` is followed by ``numberLenght - 1`` random
    digits, so a single-digit ``startingNumber`` gives a string of
    ``numberLenght`` characters. For ``numberLenght`` <= 1 only
    ``str(startingNumber)`` is returned.
    """
    # join() builds the tail in one pass instead of growing a string.
    tail = "".join(str(randint(0, 9)) for _ in range(numberLenght - 1))
    return str(startingNumber) + tail
|
||||||
|
|
||||||
|
|
||||||
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
|
# Function for writing data into a file (content = string, nameChunkStart and namePartStart are for better naming)
|
||||||
# /testdata/ folder has to be created at this point
|
# /testdata/ folder has to be created at this point
|
||||||
def writeFile(content, nameChunkStart, namePartStart):
    """Write ``content`` line by line to ``testdata/file<N>.txt``.

    ``N`` is ``int(nameChunkStart) + int(namePartStart)``. The ``testdata``
    directory must already exist; an existing file is overwritten.

    Args:
        content: iterable of strings; each is written followed by a newline.
        nameChunkStart: first part of the file number.
        namePartStart: second part of the file number.
    """
    filenumber = int(nameChunkStart) + int(namePartStart)
    # The context manager flushes and closes the handle even if a write
    # fails; previously the file was never closed explicitly.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as handle:
        for line in content:
            handle.write(line + "\n")
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,18 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
#title: kmeansMkI.py
|
# title: kmeansMkI.py
|
||||||
#description: Our personal Python K-Means++ implementation
|
# description: Our personal Python K-Means++ implementation
|
||||||
#author: Tillmann Brendel, Conrad Großer
|
# author: Tillmann Brendel, Conrad Großer
|
||||||
#license: Pending
|
# license: Pending
|
||||||
#date: 26.05.2018
|
# date: 26.05.2018
|
||||||
#version: 1.2
|
# version: 1.2
|
||||||
#usage: python pyscript.py
|
# usage: python pyscript.py
|
||||||
#notes:
|
# notes:
|
||||||
#dependencies: matplotlib
|
# dependencies: matplotlib
|
||||||
#known_issues:
|
# known_issues:
|
||||||
#python_version: 3.x
|
# python_version: 3.x
|
||||||
#==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# IMPORTS
|
# IMPORTS
|
||||||
|
|
||||||
# Importing the time for benchmarking purposes
|
# Importing the time for benchmarking purposes
|
||||||
import time
|
import time
|
||||||
from datetime import date
|
from datetime import date
|
||||||
@@ -28,111 +27,120 @@ import matplotlib.pyplot as plt
|
|||||||
import dmlib
|
import dmlib
|
||||||
import dmtest
|
import dmtest
|
||||||
|
|
||||||
# CODE
|
|
||||||
# Main function of the algorithm
|
# Main function of the algorithm
|
||||||
def kmeansmk1(data, clusters):
    """Cluster one-dimensional ``data`` into ``clusters`` groups and plot it.

    Cluster centres live in module globals named ``cpoint_<index>`` because
    ``assignCluster`` and ``calcClusters`` read and update them there.

    Steps:
        1. Seed the centres: a random data point, then the point farthest
           from it, then points farthest from the two previous centres.
        2. Alternate ``assignCluster`` and ``calcClusters`` until
           ``calcClusters`` reports that it is done.
        3. Print the final centres and show a matplotlib scatter plot with
           the centres drawn as vertical lines.

    Args:
        data: sequence of values convertible with ``int()``.
        clusters: number of clusters to compute.

    Returns:
        Always 0.
    """
    # randint() includes its upper bound, so the last valid index is
    # len(data) - 1; using len(data) could raise IndexError.
    globals()["cpoint_0"] = data[randint(0, len(data) - 1)]
    globals()["cpoint_1"] = dmlib.pp_calcdiff(data, globals()["cpoint_0"])

    print("Initial cluster 1: " + str(globals()["cpoint_0"]))
    print("Initial cluster 2: " + str(globals()["cpoint_1"]))

    # Defining the remaining cluster points from the two preceding ones
    for i in range(2, clusters):
        globals()["cpoint_" + str(i)] = dmlib.pp_calcdiff_2(
            data,
            globals()["cpoint_" + str(i - 1)],
            globals()["cpoint_" + str(i - 2)],
        )
        print("Initial cluster " + str(i + 1) + ": " + str(globals()["cpoint_" + str(i)]))

    # Get max value in the data array. Compare numerically: a plain max()
    # on strings is lexicographic and only right for equal-length entries.
    highPoint = max(data, key=int)

    # Variables for running the algorithm (runs is just for benchmarking!)
    done, runs = False, 0

    # Until calcClusters reports done, reassign the data to the clusters
    # and rearrange the cluster centres
    while not done:
        runs += 1
        new_data = assignCluster(data, highPoint, clusters)
        done = calcClusters(new_data, clusters)

    # Printing final clusters
    for i in range(0, clusters):
        print("Endcluster " + str(i + 1) + " is calculated to be at " + str(globals()["cpoint_" + str(i)]) + " after " + str(runs) + " runs")

    # Artificial y-axis (0 .. len(data) - 1) so the 1D data can be shown in
    # a 2D graphic
    anew = list(range(len(data)))

    # Drawing found clusters as lines
    for i in range(0, clusters):
        plt.axvline(x=int(globals()["cpoint_" + str(i)]), color='r')

    # Showing graph
    plt.scatter([int(x) for x in data], anew, marker='x', s=7, color='k')
    plt.show()

    return 0
|
|
||||||
|
|
||||||
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
# Calculates middle values for each cluster, takes 2D array (item, assigned_cluster)
|
||||||
def calcClusters(data, clusters):
    """Move every cluster centre to the rounded mean of its members.

    Centres are read from and written back to the module globals
    ``cpoint_<index>``.

    Args:
        data: two parallel sequences as built by ``assignCluster``:
            ``data[0]`` holds the values and ``data[1]`` the centre
            assigned to each value.
        clusters: number of clusters.

    Returns:
        True when no centre moved in this pass (the caller may stop),
        False otherwise.
    """
    converged = True
    for cluster in range(0, clusters):
        key = "cpoint_" + str(cluster)
        # Getting current cluster and saving it in temporary variable
        prev_cluster = globals()[key]

        # Collect the values currently assigned to this centre
        members = [
            int(value)
            for value, assigned in zip(data[0], data[1])
            if assigned == prev_cluster
        ]
        if not members:
            # An empty cluster keeps its centre; dividing by a member count
            # of zero would raise ZeroDivisionError.
            continue

        new_center = round(sum(members) / len(members))
        globals()[key] = new_center

        # Done only if every centre is stable. Previously a single unchanged
        # centre already ended the iteration. int() lets the initial string
        # centres compare with the rounded integer means.
        if int(prev_cluster) != new_center:
            converged = False

    return converged
|
|
||||||
|
|
||||||
def assignCluster(data, highPoint, clusters):
    """Pair every data point with its nearest current cluster centre.

    Centres are read from the module globals ``cpoint_<index>`` and
    distances are measured with ``dmlib.calcdiff``. On ties the centre with
    the lowest index wins.

    Args:
        data: sequence of data points.
        highPoint: kept for backward compatibility. It used to be the
            initial "largest possible" distance, which left the assignment
            unset or stale whenever no distance was strictly smaller.
        clusters: number of clusters.

    Returns:
        ``[data, data_assigned]`` where ``data_assigned[i]`` is the centre
        chosen for ``data[i]``.
    """
    # Read the centres once; they do not change during assignment.
    centers = [globals()["cpoint_" + str(cluster)] for cluster in range(0, clusters)]

    # Create new array for assigned clusters of each value
    data_assigned = []

    for item in data:
        best_center = None
        best_diff = None
        # Check the difference between the point (item) and each cluster
        # and keep the one with the smallest difference
        for center in centers:
            diff = dmlib.calcdiff(item, center)
            if best_diff is None or diff < best_diff:
                best_diff = diff
                best_center = center
        # Assign the minimal difference cluster to the data
        data_assigned.append(best_center)

    # Return the original values together with their assigned centres
    return [data, data_assigned]
|
|
||||||
|
|
||||||
# Startup function for collecting necesarry data
|
# Startup function for collecting necesarry data
|
||||||
def startup(data):
    """Ask for the cluster count, run ``kmeansmk1`` and report the runtime.

    Args:
        data: the data set handed on to ``kmeansmk1``.
    """
    clusters = int(input("How many clusters are known? "))
    # cores = input("How many cores should be used? ")
    # path = input("Where is the data? ") or in this case data

    # Benchmark: time only the clustering run, not the prompt above
    start_time = time.time()

    # Firing up the engines!
    kmeansmk1(data, clusters)

    # Stopping benchmark
    elapsed = time.time() - start_time
    print(str(elapsed) + " seconds for execution")
|
|
||||||
|
|
||||||
# Start the algorithm and generate test data
|
# Start the algorithm and generate test data
|
||||||
data = dmtest.plzGen(10000)
|
data = dmtest.plzGen(10000)
|
||||||
|
|||||||
@@ -10,69 +10,79 @@ from datetime import date
|
|||||||
# Importing for multi core processing
|
# Importing for multi core processing
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
||||||
|
|
||||||
# randomI function which creates each file
|
# randomI function which creates each file
|
||||||
def randomI(units, rows, rowLength, partstart):
    """Generate ``units`` files and write each one via ``writeFile``.

    File ``k`` (counting from 0) is written with ``setcounter=k`` and the
    given ``partstart`` offset. Always returns True.
    """
    setcounter = 0
    while setcounter < units:
        file_content = generateFile(rows, rowLength)
        writeFile(file_content, setcounter, partstart)
        setcounter += 1
    return True
|
||||||
|
|
||||||
|
|
||||||
# Function for generating the content of one single file
|
# Function for generating the content of one single file
|
||||||
def generateFile(rows, rowLength):
    """Return the content of one file as a list of ``rows`` random rows.

    Each row is produced by ``generateRow(rowLength)``.
    """
    # ``rows`` is a count, so iterate over range(rows); iterating over the
    # integer itself raises TypeError.
    return [generateRow(rowLength) for _ in range(rows)]
|
||||||
|
|
||||||
|
|
||||||
# Function for generating the content of one single row randomly
|
# Function for generating the content of one single row randomly
|
||||||
def generateRow(rowLength):
    """Return a string of ``rowLength`` random decimal digits.

    A ``rowLength`` of 0 (or less) yields the empty string.
    """
    # join() builds the row in one pass instead of growing a string.
    return "".join(str(randint(0, 9)) for _ in range(rowLength))
|
||||||
|
|
||||||
|
|
||||||
# Function for writing data into a file
|
# Function for writing data into a file
|
||||||
def writeFile(content, setcounter, partstart):
    """Write ``content`` line by line to ``testdata/file<N>.txt``.

    ``N`` is ``int(setcounter) + int(partstart)``. The file is opened
    relative to the current working directory in write mode, so the
    ``testdata`` directory must exist and an existing file is overwritten.

    Returns:
        Always True.
    """
    filenumber = int(setcounter) + int(partstart)
    # The context manager flushes and closes the handle even if a write
    # fails; previously the file was never closed explicitly.
    with open("testdata/file" + str(filenumber) + ".txt", "w") as handle:
        for line in content:
            handle.write(line + "\n")
    return True
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Getting the user input
    print("Hello World")
    units = int(input("How many units would you like to generate? "))
    rows = int(input("How many rows should each unit have? "))
    rowLength = int(input("How long should each row be? "))
    cores = int(input("How many cores do you want to use? "))
    if cores < 1:
        raise SystemExit("At least one core is required.")

    # Splitting up the units: every worker gets ``partsize`` units and the
    # first ``remainder`` workers take one extra. The earlier float division
    # plus int() truncation silently dropped the remainder.
    partsize, remainder = divmod(units, cores)

    # For benchmarking starting the timer now
    start_time = time.time()

    # Initialize and prepare one process per core. ``partstart`` is the
    # first file number of the worker's contiguous range.
    processes = []
    partstart = 0
    for count in range(cores):
        share = partsize + (1 if count < remainder else 0)
        processes.append(
            multiprocessing.Process(
                target=randomI,
                args=(share, rows, rowLength, partstart)
            )
        )
        partstart += share

    # Starting each core
    for count, process in enumerate(processes):
        process.start()
        print("Core " + str(count) + " started.")

    print("Working...")

    # Joining each core for the process
    for process in processes:
        process.join()

    # Finishing up the process
    sec = time.time() - start_time
    print("Data is generated. Have fun!")
    print("randomI took " + str(sec) + " seconds for execution.")
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
#title: randomI2.1.py
|
# title: randomI2.1.py
|
||||||
#description: Personal
|
# description: Personal
|
||||||
#author: Tillmann Brendel, Conrad Großer
|
# author: Tillmann Brendel, Conrad Großer
|
||||||
#license: Pending
|
# license: Pending
|
||||||
#date: 26.05.2018
|
# date: 26.05.2018
|
||||||
#version: 1.0
|
# version: 1.0
|
||||||
#usage: python pyscript.py
|
# usage: python pyscript.py
|
||||||
#notes:
|
# notes:
|
||||||
#known_issues:
|
# known_issues:
|
||||||
#python_version: 3.x
|
# python_version: 3.x
|
||||||
#==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# For random generation of numbers import randint
|
# For random generation of numbers import randint
|
||||||
from random import randint
|
from random import randint
|
||||||
@@ -21,76 +21,87 @@ from datetime import date
|
|||||||
# Importing for multi core processing
|
# Importing for multi core processing
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
||||||
|
|
||||||
# randomI function which creates each file
|
# randomI function which creates each file
|
||||||
def randomI(units, rows, rowLength, partstart, cluster):
    """Generate ``units`` files and write each one via ``writeFile``.

    File ``k`` (counting from 0) is built by
    ``generateFile(rows, rowLength, cluster)`` and written with
    ``setcounter=k`` and the given ``partstart`` offset. Always returns True.
    """
    setcounter = 0
    while setcounter < units:
        file_content = generateFile(rows, rowLength, cluster)
        writeFile(file_content, setcounter, partstart)
        setcounter += 1
    return True
|
||||||
|
|
||||||
|
|
||||||
# Function for generating the content of one single file
|
# Function for generating the content of one single file
|
||||||
def generateFile(rows, rowLength, cluster):
    """Return the content of one file as a list of ``rows`` strings.

    The first row comes from ``generate09()`` with a chance of 1 in
    ``cluster`` and from ``generatePLZ()`` otherwise. Every further row is
    ``generateRow(rowLength)``.

    NOTE(review): ``generate09`` and ``generatePLZ`` are not visible in this
    diff; presumably they return postal-code strings. Confirm.
    """
    content = []
    # ``rows`` is a count, so iterate over range(rows); iterating over the
    # integer itself raises TypeError.
    for index in range(rows):
        if index == 0:
            if randint(1, cluster) == 1:
                content.append(generate09())
            else:
                content.append(generatePLZ())
        else:
            content.append(generateRow(rowLength))
    return content
|
||||||
|
|
||||||
|
|
||||||
# Function for generating the content of one single row randomly
|
# Function for generating the content of one single row randomly
|
||||||
def generateRow(rowLength):
    """Return a string of ``rowLength`` random decimal digits.

    A ``rowLength`` of 0 (or less) yields the empty string.
    """
    # join() builds the row in one pass instead of growing a string.
    return ''.join(str(randint(0, 9)) for _ in range(rowLength))
|
||||||
|
|
||||||
|
|
||||||
# Function for writing data into a file (content = string, setcount and partstart are for better naming)
|
# Function for writing data into a file (content = string, setcount and partstart are for better naming)
|
||||||
def writeFile(content, setcounter, partstart):
    """Write ``content`` line by line to ``testdata/file<N>.txt``.

    ``N`` is ``int(setcounter) + int(partstart)``. The file is opened
    relative to the current working directory in write mode, so the
    ``testdata`` directory must exist and an existing file is overwritten.

    Returns:
        Always True.
    """
    filenumber = int(setcounter) + int(partstart)
    # The context manager flushes and closes the handle even if a write
    # fails; previously the file was never closed explicitly.
    with open('testdata/file' + str(filenumber) + '.txt', 'w') as handle:
        for line in content:
            handle.write(line + '\n')
    return True
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Getting the user input
    print('Hello World')
    units = int(input('How many units would you like to generate? '))
    rows = int(input('How many rows should each unit have? '))
    rowLength = int(input('How long should each row be? '))
    cores = int(input('How many cores do you want to use? '))
    cluster = int(input('What fraction of postal codes should be in the 09xxx cluster? 1/'))
    if cores < 1:
        raise SystemExit('At least one core is required.')

    # Splitting up the units: every worker gets ``partsize`` units and the
    # first ``remainder`` workers take one extra. The earlier float division
    # plus int() truncation silently dropped the remainder.
    partsize, remainder = divmod(units, cores)

    # For benchmarking starting the timer now
    start_time = time.time()

    # Initialize and prepare one process per core. ``partstart`` is the
    # first file number of the worker's contiguous range.
    processes = []
    partstart = 0
    for count in range(cores):
        share = partsize + (1 if count < remainder else 0)
        processes.append(
            multiprocessing.Process(
                target=randomI,
                args=(share, rows, rowLength, partstart, cluster)
            )
        )
        partstart += share

    # Starting each core
    for count, process in enumerate(processes):
        process.start()
        print('Core ' + str(count) + ' started.')

    print('Working...')

    # Joining each core for the process
    for process in processes:
        process.join()

    # Finishing up the process
    sec = time.time() - start_time
    print('Data is generated. Have fun!')
    print('randomI took ' + str(sec) + ' seconds for execution.')
|
||||||
|
|||||||
Reference in New Issue
Block a user