From 204d93ee886b7ee7eee91c0118275a9842c45cb6 Mon Sep 17 00:00:00 2001
From: garimoo <garim95@hanmail.net>
Date: Sat, 11 Nov 2023 16:09:19 +0900
Subject: [PATCH] Release rks version 1.0.0

---
 .gitignore         | 164 +++++++++++++++++++++++++++++++++++++++++++++
 README.md          | 112 ++++++++++++++++++++++++++++++-
 requirements.txt   |   3 +
 rks/__init__.py    |   0
 rks/analysis.py    |  40 +++++++++++
 rks/display.py     |  57 ++++++++++++++++
 rks/main.py        |  91 +++++++++++++++++++++++++
 rks/redis_utils.py | 147 ++++++++++++++++++++++++++++++++++++++++
 setup.py           |  26 +++++++
 9 files changed, 638 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt
 create mode 100644 rks/__init__.py
 create mode 100644 rks/analysis.py
 create mode 100644 rks/display.py
 create mode 100644 rks/main.py
 create mode 100644 rks/redis_utils.py
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7d930d1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,164 @@
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+.pypirc
+.DS_Store
diff --git a/README.md b/README.md
index 976e083..369368b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,110 @@
-# redis-keys-statistics
-rks(redis-keys-statistics) is a Python tool for analyzing and reporting key usage statistics in Redis databases, including memory usage and type distribution, created by garimoo and is under the copyright of Woowabros
+# Redis Keys Statistics
+
+## Overview
+`redis-keys-statistics` is a Python tool designed for analyzing and reporting key usage statistics in Redis databases with exceptional speed and efficiency. By leveraging Lua scripting to reduce network I/O, it dramatically outperforms other open-source tools — tasks that typically take hours are completed in just minutes.
+
+## Features
+- **High Performance**: Utilizes Lua scripting to minimize network I/O, delivering results significantly faster than traditional methods. Where some tools might take hours, `redis-keys-statistics` can complete the same task in a fraction of the time.
+- **Efficient Key Scanning**: Uses the Redis `SCAN` command for batch scanning of keys, further optimizing performance.
+- **Memory Usage Statistics**: Reports memory usage for keys in various units (B, KB, MB, GB).
+- **Top N Largest Keys**: Quickly identifies the largest keys for memory optimization.
+- **Key Count by Type**: Provides a breakdown of keys by type for better data structure insights.
+- **Prefix-Based Analysis**: Analyzes keys based on prefixes to understand namespace usage.
+- **Cluster Support**: Compatible with Redis clusters, including replica-only analysis.
+- **Readable Output**: Formatted and easy-to-read statistical tables using `prettytable`.
+
+## Installation
+```bash
+pip install rks
+```
+
+## Usage
+Execute the script with the necessary Redis connection parameters:
+```bash
+rks --host <REDIS_HOST> --port <REDIS_PORT> [--password <REDIS_PASSWORD>]
+```
+For detailed usage options:
+```bash
+rks --help
+```
+
+## Command Line Interface
+```
+usage: rks [-h] --host HOST --port PORT [--password PASSWORD] [--cluster] [--batch_size BATCH_SIZE] [--replica_only] [--pretty_format]
+
+Analyze Redis Instance Key Statistics.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --host HOST           Redis host
+  --port PORT           Redis port
+  --password PASSWORD   Redis password
+  --cluster             Enable cluster mode
+  --batch_size BATCH_SIZE
+                        Batch size for SCAN command
+  --replica_only        Execute only on replica instances
+  --pretty_format       Display output in a human-readable format
+```
+
+## Example Output
+Here is an example of what the output from `redis-keys-statistics` might look like:
+
+### Top 20 Largest Keys in Redis
+```
++-------------------+------+---------+------------+-------+
+| Key               | Type | Size    | Size Ratio |  TTL  |
++-------------------+------+---------+------------+-------+
+| user_sessions:123 | hash | 1.2 MB  | 150% ↑     |  360  |
+| cache:page:001    | zset | 900 KB  | 120% ↑     |  -1   |
+| config:app        | hash | 800 KB  | 100% ↑     | 86400 |
+| queue:jobs        | list | 600 KB  | 80% ↑      |  -1   |
+| temp:data:456     | set  | 500 KB  | 60% ↑      | 1800  |
+| ...               | ...  | ...     | ...        |  ...  |
++-------------------+------+---------+------------+-------+
+
+```
+
+### Key Count by Type
+```
++--------+-------+
+|  Type  | Count |
++--------+-------+
+| hash   | 250   |
+| zset   | 150   |
+| list   | 100   |
+| set    | 75    |
+| string | 200   |
++--------+-------+
+
+```
+
+### Detailed Prefix Statistics
+```
++-------------+-------+--------------+---------+-----------------+
+| Prefix Name | Count | Average Size | Max TTL | Types           |
++-------------+-------+--------------+---------+-----------------+
+| user        | 100   | 200 KB       | 3600    | - Type: hash    |
+|             |       |              |         |   Count: 50     |
+|             |       |              |         | - Type: string  |
+|             |       |              |         |   Count: 50     |
+| cache       | 80    | 150 KB       | -1      | - Type: zset    |
+|             |       |              |         |   Count: 80     |
+| config      | 20    | 100 KB       | 86400   | - Type: hash    |
+|             |       |              |         |   Count: 20     |
+| temp        | 150   | 50 KB        | 1800    | - Type: set     |
+|             |       |              |         |   Count: 100    |
+|             |       |              |         | - Type: list    |
+|             |       |              |         |   Count: 50     |
++-------------+-------+--------------+---------+-----------------+
+
+```
+
+## Requirements
+- Python 3.6 or later
+- Redis server or cluster, version 4.0 or later (the tool relies on the `MEMORY USAGE` command)
+
+## Contributing
+Contributions are welcome. 
+
+## Acknowledgements
+Special thanks to all contributors and users of the `redis-keys-statistics` tool.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..eadcc73
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+redis
+redis-py-cluster
+prettytable
diff --git a/rks/__init__.py b/rks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rks/analysis.py b/rks/analysis.py
new file mode 100644
index 0000000..0cb9682
--- /dev/null
+++ b/rks/analysis.py
@@ -0,0 +1,66 @@
+"""Pure helpers for aggregating per-key statistics collected from Redis."""
+import heapq
+
+
+def format_memory_size(size):
+    """Render a byte count as a human-readable string.
+
+    Args:
+        size: Size in bytes (int or float).
+
+    Returns:
+        The value with two decimals and a B/KB/MB/GB suffix (1 KB = 1024 B).
+    """
+    # Step up one unit at a time until the value fits below 1024.
+    for unit in ("B", "KB", "MB"):
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+    return f"{size:.2f} GB"
+
+
+def update_topk_heap(item, min_heap, heap_size):
+    """Keep ``min_heap`` limited to the ``heap_size`` largest records.
+
+    Args:
+        item: Scan record laid out as ``[key, type, memory, ttl]``.
+        min_heap: List used as a ``heapq`` min-heap of ``(memory, item)``
+            pairs; modified in place.
+        heap_size: Maximum number of entries to retain.
+    """
+    entry = (item[2], item)
+    if len(min_heap) < heap_size:
+        heapq.heappush(min_heap, entry)
+    else:
+        # Heap is full: add the newcomer and evict the smallest in one step.
+        heapq.heappushpop(min_heap, entry)
+
+
+def update_statistics(item, prefix_statistics_map):
+    """Fold one scan record into the per-prefix statistics.
+
+    The prefix is the part of the key before its first ``:``; keys without a
+    ``:`` are left out of the prefix statistics.
+
+    Args:
+        item: Scan record laid out as ``[key, type, memory, ttl]``; the key
+            must be ``bytes``.
+        prefix_statistics_map: Dict keyed by prefix and updated in place. Each
+            value tracks ``count``, ``total_size``, ``max_ttl`` and a
+            ``type_count`` dict.
+    """
+    key, key_type, memory, ttl = item[0], item[1], int(item[2]), item[3]
+
+    prefix, separator, _ = key.partition(b':')
+    if not separator:
+        return
+
+    stats = prefix_statistics_map.get(prefix)
+    if stats is None:
+        stats = {'count': 0, 'total_size': 0, 'max_ttl': -1, 'type_count': {}}
+        prefix_statistics_map[prefix] = stats
+
+    stats['count'] += 1
+    stats['total_size'] += memory
+    stats['max_ttl'] = max(stats['max_ttl'], ttl)
+    stats['type_count'][key_type] = stats['type_count'].get(key_type, 0) + 1
\ No newline at end of file
diff --git a/rks/display.py b/rks/display.py
new file mode 100644
index 0000000..a6f821e
--- /dev/null
+++ b/rks/display.py
@@ -0,0 +1,74 @@
+"""Render the collected key statistics as PrettyTable reports."""
+from prettytable import PrettyTable
+from .analysis import format_memory_size
+
+
+def analyze_redis_keys(min_heap, prefix_statistics_map, total_key_size, total_key_count, key_count_by_type, db_num, use_pretty):
+    """Print the largest-keys, per-type and per-prefix tables for one scan.
+
+    Args:
+        min_heap: Iterable of ``(memory, item)`` pairs for the largest keys;
+            each ``item`` is ``[key, type, memory, ttl]`` with bytes key/type.
+        prefix_statistics_map: Dict of prefix (bytes) to a dict holding
+            ``count``, ``total_size``, ``max_ttl`` and ``type_count``.
+        total_key_size: Summed memory usage of every scanned key.
+        total_key_count: Number of scanned keys.
+        key_count_by_type: Dict of key type (bytes) to number of keys.
+        db_num: Database number printed in the section banner.
+        use_pretty: When true, sizes are rendered with units instead of as
+            raw numbers.
+    """
+    average_key_size = total_key_size / total_key_count if total_key_count != 0 else 0
+
+    print("\n" + "-"*25 + f"\nAnalyzing DB {db_num}\n" + "-"*25 + "\n")
+
+    table = PrettyTable()
+    table.title = "Top 20 largest keys in Redis"
+    table.field_names = ["Key", "Type", "Size", "Size Ratio", "TTL"]
+
+    for _, item in sorted(min_heap, key=lambda pair: pair[0], reverse=True):
+        key, key_type, ttl = item[0], item[1], item[3]
+        memory = int(item[2])
+
+        if average_key_size:
+            size_ratio = (memory - average_key_size) / average_key_size * 100
+            # Point the arrow the way the key deviates from the average; the
+            # smaller "top" keys of a small database can sit below it.
+            arrow = "↑" if size_ratio >= 0 else "↓"
+            formatted_size_ratio = "{:.1f}% {}".format(size_ratio, arrow)
+        else:
+            # A zero average gives nothing to compare against (and would
+            # divide by zero).
+            formatted_size_ratio = "-"
+
+        size = format_memory_size(memory) if use_pretty else memory
+        table.add_row([key.decode('utf-8', errors='ignore'), key_type.decode('utf-8'), size, formatted_size_ratio, ttl])
+
+    print(table)
+
+    table = PrettyTable()
+    table.title = "Key Count by Type"
+    table.field_names = ["Type", "Count"]
+
+    for key_type, count in key_count_by_type.items():
+        table.add_row([key_type.decode('utf-8', errors='ignore'), count])
+
+    print(table)
+
+    table = PrettyTable()
+    table.title = "Detailed Prefix Statistics"
+    table.field_names = ["Prefix Name", "Count", "Average Size", "Max TTL", "Types"]
+
+    for prefix, stats in prefix_statistics_map.items():
+        prefix_average_size = round(stats['total_size'] / stats['count'], 2) if stats['count'] != 0 else 0
+        if use_pretty:
+            prefix_average_size = format_memory_size(prefix_average_size)
+
+        # One line per type; each keeps its trailing newline.
+        types = "".join(
+            " - Type: {}, Count: {}\n".format(key_type.decode('utf-8'), type_count)
+            for key_type, type_count in stats['type_count'].items()
+        )
+        table.add_row([prefix.decode('utf-8', errors='ignore'), stats['count'], prefix_average_size, stats['max_ttl'], types])
+
+    print(table)
\ No newline at end of file
diff --git a/rks/main.py b/rks/main.py
new file mode 100644
index 0000000..fa317dd
--- /dev/null
+++ b/rks/main.py
@@ -0,0 +1,124 @@
+"""Command-line entry point for rks (redis-keys-statistics)."""
+import argparse
+import redis
+import rediscluster
+from datetime import datetime
+from .redis_utils import get_redis_keys, get_redis_cluster_keys
+
+# Socket timeout, in seconds, for the connections opened by this module.
+CONNECTION_TIMEOUT = 10
+
+NOT_CLUSTER_MESSAGE = "Error: Specified Redis instance is not running in cluster mode but --cluster flag was provided."
+
+
+def _parse_args():
+    """Parse the command line and return the argparse namespace."""
+    parser = argparse.ArgumentParser(description='Analyze Redis Instance Key Statistics.')
+    parser.add_argument('--host', required=True, help='Redis host')
+    # type=int lets argparse reject a malformed port with a usage error
+    # instead of an uncaught ValueError later on.
+    parser.add_argument('--port', required=True, type=int, help='Redis port')
+    parser.add_argument('--password', default=None, help='Redis password')
+    parser.add_argument('--cluster', action='store_true', help='Enable cluster mode')
+    parser.add_argument('--batch_size', type=int, default=1000, help='Batch size for SCAN command')
+    parser.add_argument('--replica_only', action='store_true', help='Execute only on replica instances')
+    parser.add_argument('--pretty_format', action='store_true', help='Display output in a human-readable format')
+    return parser.parse_args()
+
+
+def _print_elapsed(process_start_time):
+    """Print how long the analysis took, truncated to whole seconds."""
+    process_taken_time = str(datetime.now() - process_start_time).split(".")[0]
+    print(f'Process completed in {process_taken_time} (HH:MM:SS)')
+
+
+def _analyze_cluster(args):
+    """Connect to a Redis cluster and report on its keys.
+
+    Connection errors propagate to the caller; the client is closed on every
+    path once it has been created.
+    """
+    rc = rediscluster.RedisCluster(startup_nodes=[{"host": args.host, "port": args.port}],
+                                   skip_full_coverage_check=True,
+                                   password=args.password,
+                                   socket_timeout=CONNECTION_TIMEOUT)
+    try:
+        redis_info = rc.info()
+
+        if list(redis_info.values())[0].get('cluster_enabled') != 1:
+            print(NOT_CLUSTER_MESSAGE)
+            return
+
+        process_start_time = datetime.now()
+
+        if get_redis_cluster_keys(rc, args.batch_size, args.replica_only, args.pretty_format) == -1:
+            print(f"Aborted the operation on non-readonly redis at host: {args.host}")
+            return
+    finally:
+        rc.close()
+
+    _print_elapsed(process_start_time)
+
+
+def _analyze_standalone(args):
+    """Connect to a single Redis instance and report on each of its databases.
+
+    Connection errors propagate to the caller; the client is closed on every
+    path once it has been created.
+    """
+    r = redis.Redis(host=args.host, port=args.port,
+                    password=args.password,
+                    socket_timeout=CONNECTION_TIMEOUT)
+    try:
+        redis_info = r.info()
+
+        if redis_info.get('cluster_enabled') != 0:
+            print("Error: Specified Redis instance is running in cluster mode but --cluster flag was not provided.")
+            return
+
+        if redis_info.get('role') != 'slave' and args.replica_only:
+            print(f"Aborted the operation on non-readonly redis at host: {args.host}")
+            return
+
+        process_start_time = datetime.now()
+
+        # Every INFO entry named "db<N>" is analyzed as database N.
+        for db in redis_info:
+            if db.startswith('db'):
+                get_redis_keys(r, args.batch_size, int(db[2:]), args.pretty_format)
+    finally:
+        r.close()
+
+    _print_elapsed(process_start_time)
+
+
+def main():
+    """Run the rks command-line tool.
+
+    Parses the arguments, analyzes the target in cluster or standalone mode
+    and prints a message instead of a traceback for the connection errors
+    handled below.
+    """
+    args = _parse_args()
+
+    if args.cluster:
+        try:
+            _analyze_cluster(args)
+        except rediscluster.exceptions.RedisClusterError as e:
+            print(f"Error connecting to Redis cluster: {args.host}. Error message: {str(e)}")
+        except rediscluster.exceptions.RedisClusterException as e:
+            if "ERROR sending 'cluster slots' command to redis server" in str(e):
+                print(NOT_CLUSTER_MESSAGE)
+            else:
+                print(f"Failed to connect to Redis cluster: {args.host}. Error message: {str(e)}")
+    else:
+        try:
+            _analyze_standalone(args)
+        except redis.exceptions.TimeoutError as e:
+            print(f"Timeout connecting to Redis: {args.host}. Error message: {str(e)}")
+        except redis.exceptions.ConnectionError as e:
+            print(f"Failed to connect to Redis: {args.host}. Error message: {str(e)}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/rks/redis_utils.py b/rks/redis_utils.py
new file mode 100644
index 0000000..6d52296
--- /dev/null
+++ b/rks/redis_utils.py
@@ -0,0 +1,177 @@
+"""Key scanning for standalone Redis instances and Redis clusters."""
+import redis
+import time
+from .display import analyze_redis_keys
+from .analysis import update_topk_heap, update_statistics
+
+# How many of the largest keys are kept for the report.
+TOP_KEY_COUNT = 20
+
+# Lua script that performs one SCAN step and collects, for every key in the
+# batch, its name, type, memory usage and TTL in the same call.
+# ARGV[1] is the SCAN cursor, ARGV[2] the COUNT hint.
+SCAN_SCRIPT = """
+    local cursor = ARGV[1]
+    local result = {}
+
+    local scanResult = redis.call('SCAN', cursor, 'COUNT', ARGV[2])
+    cursor = scanResult[1]
+
+    for i, key in ipairs(scanResult[2]) do
+        local key_type = redis.call('TYPE', key)['ok']
+        local memory = redis.call('MEMORY', 'USAGE', key)
+        local ttl = redis.call('TTL', key)
+
+        table.insert(result, {key, key_type, memory, ttl})
+    end
+
+    return {cursor, result}
+"""
+
+
+def _scan_keys(r, batch_size, min_heap, prefix_statistics_map, key_count_by_type):
+    """Scan every key visible through ``r`` and fold it into the aggregates.
+
+    Args:
+        r: Redis client for the node (and database) to scan.
+        batch_size: COUNT hint passed to each SCAN step.
+        min_heap: Top-keys heap, updated in place.
+        prefix_statistics_map: Per-prefix statistics, updated in place.
+        key_count_by_type: Dict of key type to key count, updated in place.
+
+    Returns:
+        ``(key_count, key_size)`` for the keys seen by this scan.
+    """
+    key_count = 0
+    key_size = 0
+    cursor = b'0'
+
+    while True:
+        cursor, result = r.eval(SCAN_SCRIPT, 0, cursor, batch_size)
+
+        for item in result:
+            key_type = item[1]
+            key_count += 1
+            key_size += int(item[2])
+            key_count_by_type[key_type] = key_count_by_type.get(key_type, 0) + 1
+
+            update_topk_heap(item, min_heap, TOP_KEY_COUNT)
+            update_statistics(item, prefix_statistics_map)
+
+        # The script returns cursor 0 once SCAN has covered the keyspace.
+        if cursor == b'0':
+            return key_count, key_size
+
+        # Brief pause between batches (presumably to limit server load).
+        time.sleep(0.01)
+
+
+def get_redis_keys(r, batch_size, db_num, use_pretty):
+    """Scan one database of a standalone instance and print its report.
+
+    Args:
+        r: Connected ``redis.Redis`` client.
+        batch_size: COUNT hint for each SCAN step.
+        db_num: Database number to select and analyze.
+        use_pretty: Passed to the report; formats sizes with units.
+
+    Returns:
+        Whatever ``analyze_redis_keys`` returns.
+    """
+    r.execute_command('SELECT', db_num)
+
+    min_heap = []
+    prefix_statistics_map = {}
+    key_count_by_type = {}
+
+    total_key_count, total_key_size = _scan_keys(
+        r, batch_size, min_heap, prefix_statistics_map, key_count_by_type)
+
+    return analyze_redis_keys(min_heap, prefix_statistics_map, total_key_size, total_key_count, key_count_by_type, db_num, use_pretty)
+
+
+def _select_scan_targets(nodes, replica_only):
+    """Pick one node to scan per master: its first replica, else the master.
+
+    Args:
+        nodes: Node dicts as returned by ``cluster_nodes()``.
+        replica_only: When true, never fall back to scanning a master.
+
+    Returns:
+        The node dicts to scan, or ``None`` when ``replica_only`` is set and
+        a replica is not available for every master.
+    """
+    replicas_by_master = {}
+    for node in nodes:
+        if 'slave' in node['flags']:
+            replicas_by_master.setdefault(node['master'], []).append(node)
+
+    if replica_only and not replicas_by_master:
+        return None
+
+    targets = []
+    for node in nodes:
+        if 'master' not in node['flags']:
+            continue
+        replicas = replicas_by_master.get(node['id'])
+        if replicas:
+            targets.append(replicas[0])
+        elif replica_only:
+            return None
+        else:
+            # No replica for this master: scan the master itself instead of
+            # failing on the missing replica.
+            targets.append(node)
+    return targets
+
+
+def get_redis_cluster_keys(rc, batch_size, replica_only, use_pretty):
+    """Scan one node per master of a cluster and print a combined report.
+
+    Args:
+        rc: Connected cluster client providing ``cluster_nodes()``.
+        batch_size: COUNT hint for each SCAN step.
+        replica_only: When true, only replicas may be scanned.
+        use_pretty: Passed to the report; formats sizes with units.
+
+    Returns:
+        ``-1`` when ``replica_only`` is set but a replica is not available
+        for every master; otherwise whatever ``analyze_redis_keys`` returns.
+    """
+    targets = _select_scan_targets(rc.cluster_nodes(), replica_only)
+    if targets is None:
+        return -1
+
+    # Reuse the cluster client's password and timeout for the direct per-node
+    # connections so password-protected clusters can be scanned.
+    # NOTE(review): assumes the cluster client exposes these settings as
+    # connection_pool.connection_kwargs - confirm for the installed
+    # redis-py-cluster version; without them the defaults are used.
+    pool = getattr(rc, 'connection_pool', None)
+    pool_kwargs = getattr(pool, 'connection_kwargs', None) or {}
+    node_kwargs = {
+        name: pool_kwargs[name]
+        for name in ('password', 'socket_timeout')
+        if pool_kwargs.get(name) is not None
+    }
+
+    min_heap = []
+    prefix_statistics_map = {}
+    key_count_by_type = {}
+    total_key_count = 0
+    total_key_size = 0
+
+    for node in targets:
+        r = redis.Redis(host=node['host'], port=node['port'], **node_kwargs)
+        try:
+            r.execute_command('READONLY')
+            key_count, key_size = _scan_keys(
+                r, batch_size, min_heap, prefix_statistics_map, key_count_by_type)
+        finally:
+            # Release the node connection even when the scan fails.
+            r.close()
+
+        total_key_count += key_count
+        total_key_size += key_size
+
+    return analyze_redis_keys(min_heap, prefix_statistics_map, total_key_size, total_key_count, key_count_by_type, 0, use_pretty)
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..47e15da
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,35 @@
+"""Packaging metadata for the rks (redis-keys-statistics) distribution."""
+from setuptools import find_packages, setup
+
+DESCRIPTION = 'rks(redis-keys-statistics) is a Python tool for analyzing and reporting key usage statistics in Redis databases, including memory usage and type distribution, created by garimoo and is under the copyright of Woowabros'
+
+# Runtime dependencies installed together with the package.
+INSTALL_REQUIRES = ['redis', 'redis-py-cluster', 'prettytable']
+
+KEYWORDS = ['redis', 'pypi', 'redis keys statistics', 'rks', 'redis statistics', 'garim', 'garimoo']
+
+CLASSIFIERS = [
+    'Programming Language :: Python :: 3.6',
+    'Programming Language :: Python :: 3.7',
+    'Programming Language :: Python :: 3.8',
+    'Programming Language :: Python :: 3.9',
+]
+
+# Installs an ``rks`` command that calls ``main`` from ``rks.main``.
+ENTRY_POINTS = {'console_scripts': ['rks = rks.main:main']}
+
+setup(
+    name='rks',
+    version='1.0.0',
+    description=DESCRIPTION,
+    author='garimoo',
+    author_email='garimoo.kim@gmail.com',
+    install_requires=INSTALL_REQUIRES,
+    packages=find_packages(),
+    entry_points=ENTRY_POINTS,
+    keywords=KEYWORDS,
+    python_requires='>=3.6',
+    package_data={},
+    zip_safe=False,
+    classifiers=CLASSIFIERS,
+)
\ No newline at end of file