diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ebb9095
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.env/
+__pycache__/
+dist/
+*.egg-info/
diff --git a/README.md b/README.md
index 73d696d..2bdaeea 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,12 @@ Well that is basically up to you. Entropy functions are used in Computer Science
 
 *Update:* This script now can calculate the maximum entropy now too. This is pretty useful for pre-compression analyses. Maximum entropy is calculated by splitting the alphabet into parts of the same size and calculating the entropy of this, like: `-1 * SIZE_OF_ALPHABET * (DISTINCT_PROBABILITY * log(DISTINCT_PROBABILITY, 2))`.
 
+## Installing
+You can install this package easily with `pip`:
+```
+$ pip install git+https://github.com/creyD/entro.py@dev_package
+```
+
 ## Usage
 You can run as much calculations as you want in one run of the script. For example use it like this with a simple string (you can skip the quotation marks if you don't have spaces in your string - if you want):
 
diff --git a/entro.py b/entro.py
deleted file mode 100644
index eea5503..0000000
--- a/entro.py
+++ /dev/null
@@ -1,90 +0,0 @@
-'''
-    calc_entro.py calculates the entropy of a given string or file
-
-    This uses the negative sum of the log (to the base of 2) of the probability
-    times the probability of a char to occur in a certain string as the entropy.
-'''
-
-import math
-import argparse
-
-
-# Calculates the entropy of a given string (as described in the docstring)
-def calculateEntropy(input_string):
-    alphabet, alphabet_size, entropy = {}, len(input_string), 0
-
-    for char in input_string:
-        if char in alphabet:
-            alphabet[char] += 1
-        else:
-            alphabet[char] = 1
-
-    for char in alphabet:
-        alphabet[char] = alphabet[char] / alphabet_size
-        entropy -= alphabet[char] * math.log(alphabet[char], 2)
-
-    max_entropy = - len(alphabet) * (1/len(alphabet) * math.log(1/len(alphabet), 2))
-    return entropy, alphabet, max_entropy
-
-
-# Outputs a given entropy including the original text and the alphabet with probabilities
-def printEntropy(original_string, entropy_value, alphabet_dict, simple_bool, max_value):
-    print('---')
-    if not simple_bool:
-        print('Content: ' + original_string)
-        print('Probabilities: ' + str(alphabet_dict))
-    print('Entropy: ' + str(entropy_value) + ' bits')
-    if max_value:
-        print('Maximum Entropy: ' + str(max_value) + ' bits')
-    print('---')
-
-
-# Reads a file by a given path
-def readEntropyFile(path_string):
-    f = open(path_string, 'r')
-    content = f.read().replace('\n', ' ')
-    f.close()
-    return content.strip()
-
-
-# List of the arguments one can use to influence the behavior of the program
-parser = argparse.ArgumentParser(description='Calculate the information entropy of alphabets.')
-
-# INPUT ARGUMENTS
-parser.add_argument('strings', nargs='*', default='', type=str, help='Strings to calculate the entropy of.')
-parser.add_argument('--files', nargs='*', type=str, default='', help='Provide file path(s) to calculate the entropy of.')
-
-# OUTPUT OPTIONS
-parser.add_argument('--simple', nargs='?', type=bool, default=False, help='Determines the explicitness of the output. (True = only entropy shown)')
-parser.add_argument('--max', nargs='?', type=bool, default=False, help='Includes the maximum entropy.')
-
-# CONVERT OPTIONS
-parser.add_argument('--lower', nargs='?', type=bool, default=False, help='Converts given strings or textfiles to lowercase before calculating.')
-parser.add_argument('--upper', nargs='?', type=bool, default=False, help='Converts given strings or textfiles to uppercase before calculating.')
-parser.add_argument('--squash', nargs='?', type=bool, default=False, help='Removes all whitespaces before calculating.')
-args = parser.parse_args()
-
-# Prepares the queue of different strings
-queue = []
-
-# Add all the provided strings to the list
-for string in args.strings:
-    queue.append(string)
-
-# Add all the provided files to the list
-for file in args.files:
-    string = readEntropyFile(file)
-    queue.append(string)
-
-# Interates over the collected strings and prints the entropies
-for string in queue:
-    if args.lower:
-        string = string.lower()
-    elif args.upper:
-        string = string.upper()
-
-    if args.squash:
-        string = string.replace(" ", "")
-
-    a, b, c = calculateEntropy(string)
-    printEntropy(string, a, b, args.simple, (False if not args.max else c))
diff --git a/entro_py_min/__main__.py b/entro_py_min/__main__.py
new file mode 100644
index 0000000..e1fdc02
--- /dev/null
+++ b/entro_py_min/__main__.py
@@ -0,0 +1,58 @@
+from . import entro_py_min
+import argparse
+
+
+# Converts a command line value such as "True", "no" or "1" to a boolean
+# Raises argparse.ArgumentTypeError if the value is not a known boolean word
+def parseBool(value):
+    normalized = value.strip().lower()
+    if normalized in ('true', 't', 'yes', 'y', '1'):
+        return True
+    if normalized in ('false', 'f', 'no', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('Expected a boolean value, got: ' + value)
+
+
+# List of the arguments one can use to influence the behavior of the program
+parser = argparse.ArgumentParser('entro_py_min', description='Calculate the information entropy of alphabets.')
+
+# INPUT ARGUMENTS
+parser.add_argument('strings', nargs='*', default='', type=str, help='Strings to calculate the entropy of.')
+parser.add_argument('--files', nargs='*', type=str, default='', help='Provide file path(s) to calculate the entropy of.')
+
+# OUTPUT OPTIONS
+# A bare flag (e.g. --max) enables an option via const=True; an explicit value
+# (e.g. --max False) is parsed by parseBool, as type=bool treats 'False' as True
+parser.add_argument('--simple', nargs='?', type=parseBool, const=True, default=False, help='Determines the explicitness of the output. (True = only entropy shown)')
+parser.add_argument('--max', nargs='?', type=parseBool, const=True, default=False, help='Includes the maximum entropy.')
+
+# CONVERT OPTIONS
+parser.add_argument('--lower', nargs='?', type=parseBool, const=True, default=False, help='Converts given strings or textfiles to lowercase before calculating.')
+parser.add_argument('--upper', nargs='?', type=parseBool, const=True, default=False, help='Converts given strings or textfiles to uppercase before calculating.')
+parser.add_argument('--squash', nargs='?', type=parseBool, const=True, default=False, help='Removes all whitespaces before calculating.')
+args = parser.parse_args()
+
+# Prepares the queue of different strings
+queue = []
+
+# Add all the provided strings to the list
+for string in args.strings:
+    queue.append(string)
+
+# Add all the provided files to the list
+for file in args.files:
+    string = entro_py_min.readEntropyFile(file)
+    queue.append(string)
+
+# Iterates over the collected strings and prints the entropies
+for string in queue:
+    if args.lower:
+        string = string.lower()
+    elif args.upper:
+        string = string.upper()
+
+    if args.squash:
+        string = string.replace(" ", "")
+
+    a, b, c = entro_py_min.calculateEntropy(string)
+    entro_py_min.printEntropy(string, a, b, args.simple, (False if not args.max else c))
diff --git a/entro_py_min/entro_py_min.py b/entro_py_min/entro_py_min.py
new file mode 100644
index 0000000..cb42844
--- /dev/null
+++ b/entro_py_min/entro_py_min.py
@@ -0,0 +1,62 @@
+import math
+
+
+# Calculates the entropy of a given string
+# Returns the entropy, an alphabet with the calculated probabilities and the
+# maximum entropy (zero, an empty alphabet and zero for an empty string)
+def calculateEntropy(input_string):
+    alphabet, alphabet_size, entropy = {}, len(input_string), 0
+
+    # An empty input has no alphabet: report zero instead of dividing by zero
+    if alphabet_size == 0:
+        return entropy, alphabet, 0
+
+    for char in input_string:
+        if char in alphabet:
+            alphabet[char] += 1
+        else:
+            alphabet[char] = 1
+
+    for char in alphabet:
+        alphabet[char] = alphabet[char] / alphabet_size
+        entropy -= alphabet[char] * math.log(alphabet[char], 2)
+
+    max_entropy = - len(alphabet) * (1 / len(alphabet) * math.log(1 / len(alphabet), 2))
+    return entropy, alphabet, max_entropy
+
+
+# Calculates the entropy of a given string
+# Returns only the entropy in bits as this is the minimal function
+def calculateEntropyMin(input_string):
+    alphabet, alphabet_size, entropy = {}, len(input_string), 0
+
+    for char in input_string:
+        if char in alphabet:
+            alphabet[char] += 1
+        else:
+            alphabet[char] = 1
+
+    for char in alphabet:
+        i = alphabet[char] / alphabet_size
+        entropy -= i * math.log(i, 2)
+    return entropy
+
+
+# Outputs a given entropy including the original text and the alphabet with probabilities
+def printEntropy(original_string, entropy_value, alphabet_dict, simple_bool, max_value):
+    print('---')
+    if not simple_bool:
+        print('Content: ' + original_string)
+        print('Probabilities: ' + str(alphabet_dict))
+    print('Entropy: ' + str(entropy_value) + ' bits')
+    if max_value:
+        print('Maximum Entropy: ' + str(max_value) + ' bits')
+    print('---')
+
+
+# Reads a file by a given path
+def readEntropyFile(path_string):
+    # The with block closes the file even if reading fails
+    with open(path_string, 'r') as f:
+        content = f.read().replace('\n', ' ')
+    return content.strip()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..38c5cc1
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="entro_py_min",
+    version="0.0.1",
+    author="Conrad Großer",
+    author_email="grosserconrad@gmail.com",
+    description="Small Information Entropy Calculator",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/creyD/entro.py",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+)