From 04f681dba6f09551805c19df58c8035581be610e Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 00:40:28 +0200 Subject: [PATCH 01/20] %words improvement --- README.md | 4 +++- src/main.py | 12 ++++++------ src/scanners/words_scanner.py | 19 +++++++------------ 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 881d143..caac0c2 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ * %react - rank users by their reactions * arguments: * - top messages, default is 10 -* %words - rank words by their usage +* %words - (BETA) rank words by their usage * arguments: * - words containings or more letters, default is 3 * - top words, default is 10 @@ -104,6 +104,8 @@ python3 src/main.py ## Changelog +* **v1.13** + * improved scan `%words` * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/src/main.py b/src/main.py index 565e92d..4ac9b5f 100644 --- a/src/main.py +++ b/src/main.py @@ -45,6 +45,12 @@ bot.register_command( "cancel: stop current analysis (not launched with fast)", "```\n" + "%cancel: Stop current analysis (not launched with fast)\n" + "```", ) +bot.register_command( + "words", + lambda *args: WordsScanner().compute(*args), + "words: (BETA) rank words by their usage", + WordsScanner.help(), +) bot.register_command( "last", lambda *args: LastScanner().compute(*args), @@ -63,12 +69,6 @@ bot.register_command( "first: read first message", FirstScanner.help(), ) -bot.register_command( - "words", - lambda *args: WordsScanner().compute(*args), - "words: rank words by their usage", - WordsScanner.help(), -) bot.register_command( "mentioned", lambda *args: MentionedScanner().compute(*args), diff --git a/src/scanners/words_scanner.py b/src/scanners/words_scanner.py index 80d0971..c31bf9b 100644 --- a/src/scanners/words_scanner.py +++ b/src/scanners/words_scanner.py @@ -20,7 +20,7 @@ class WordsScanner(Scanner): def help() -> str: return ( "```\n" - + "%words: Rank words by their usage\n" + + 
"%words: (BETA) Rank words by their usage\n" + "arguments:\n" + COMMON_HELP_ARGS + "* - words containings or more letters, default is 3\n" @@ -104,16 +104,13 @@ class WordsScanner(Scanner): or message.author in raw_members ): impacted = True - content = " ".join( - [ - block - for block in message.content.split() - if not re.match(r"^\w+:\/\/", block) - ] - ) + content = message.content + content = re.sub(r"```.+```", "", content, flags=re.DOTALL) + content = re.sub(r"`.+`", "", content, flags=re.DOTALL) + content = re.sub(r"\w+:\/\/[^ ]+", "", content) for word in re.split("[^\w\-':]", content): m = re.match( - r"(?!^:\w+:$)^[^\w]*((?![\d_])\w.*(?![\d_])\w)[^\w]*$", word + r"(?!^:\w+:$)^[^\w]*((?![\d_])\w[\w\-']*(?![\d_])\w)[^\w]*$", word ) if m: word = m[1].lower() @@ -126,7 +123,5 @@ class WordsScanner(Scanner): words[word] = words[word + case] del words[word + case] break - words[word].update_use( - message.content.count(word), message.created_at - ) + words[word].update_use(1, message.created_at) return impacted From a26b90f3928e71e970b93b88ee46c05ead9d1cc4 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 00:41:54 +0200 Subject: [PATCH 02/20] simple CI --- .github/workflows/python.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/python.yml diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000..68ceedd --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,29 @@ +name: Python + +on: ["push", "pull_request"] + +jobs: + syntax: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.7, 3.8, 3.9] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 black + - name: Lint with flake8 + run: | + # 
stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Code style with black + run: | + black --check \ No newline at end of file From ee71314c41404d1a25744c30c63bb241be981e1d Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 00:45:57 +0200 Subject: [PATCH 03/20] removed black check --- .github/workflows/python.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 68ceedd..75757c7 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -23,7 +23,4 @@ jobs: # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Code style with black - run: | - black --check \ No newline at end of file + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics \ No newline at end of file From 6cacb832bf18ec962e391d7a5f6f47dff121ddd3 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 00:46:36 +0200 Subject: [PATCH 04/20] removed black check --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 75757c7..5029c28 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 black + python -m pip install flake8 - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names From 48c4e82cdfce7b39e5e12625685b83b5df6f303a Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 12:19:43 +0200 Subject: [PATCH 05/20] remove old and unused logs at start and guild leaving --- README.md | 1 + src/logs/guild_logs.py | 24 +++++++++++++++++++++++- src/main.py | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index caac0c2..c130ef5 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ python3 src/main.py * **v1.13** * improved scan `%words` + * remove old and unused logs at start and guild leaving * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 88d8823..6fe0201 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -15,6 +15,7 @@ from utils import code_message, delta, deltas LOG_DIR = "logs" +LOG_EXT = ".logz" current_analysis = [] current_analysis_lock = threading.Lock() @@ -23,7 +24,10 @@ current_analysis_lock = threading.Lock() ALREADY_RUNNING = -100 CANCELLED = -200 +# 5 minutes, assume 'fast' arg MIN_MODIFICATION_TIME = 5 * 60 +# ~6 months, remove log file +MAX_MODIFICATION_TIME = 6 * 30.5 * 24 * 60 * 60 class 
Worker: @@ -54,7 +58,7 @@ class GuildLogs: def __init__(self, guild: discord.Guild): self.id = guild.id self.guild = guild - self.log_file = os.path.join(LOG_DIR, f"{guild.id}.logz") + self.log_file = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") self.channels = {} self.locked = False @@ -322,3 +326,21 @@ class GuildLogs: f"No cancellable analysis are currently running on this server", reference=message, ) + + @staticmethod + def check_logs(guilds: List[discord.Guild]): + logging.info(f"checking logs...") + guild_ids = [str(guild.id) for guild in guilds] + for item in os.listdir(LOG_DIR): + path = os.path.join(LOG_DIR, item) + name, ext = os.path.splitext(item) + if os.path.isfile(path) and ext == LOG_EXT: + if ( + name in guild_ids + and (time.time() - os.path.getmtime(path)) > MAX_MODIFICATION_TIME + ): + logging.info(f"> removing old log '{path}'") + os.unlink(path) + elif name not in guild_ids: + logging.info(f"> removing unused log '{path}'") + os.unlink(path) diff --git a/src/main.py b/src/main.py index 4ac9b5f..69f6b0a 100644 --- a/src/main.py +++ b/src/main.py @@ -39,6 +39,20 @@ bot = Bot( bot.log_calls = True + +async def on_ready(): + GuildLogs.check_logs(bot.client.guilds) + return True + + +async def on_guild_remove(): + GuildLogs.check_logs(bot.client.guilds) + return True + + +bot.register_event(on_ready) +bot.register_event(on_guild_remove) + bot.register_command( "(cancel|stop)", GuildLogs.cancel, From 0550a16c51f8e3899e8a9bb21c47433fe114a73f Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 12:20:36 +0200 Subject: [PATCH 06/20] create log dir before checking --- src/logs/guild_logs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 6fe0201..3c4af39 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -330,6 +330,8 @@ class GuildLogs: @staticmethod def check_logs(guilds: List[discord.Guild]): logging.info(f"checking logs...") + if not os.path.exists(LOG_DIR): + 
os.mkdir(LOG_DIR) guild_ids = [str(guild.id) for guild in guilds] for item in os.listdir(LOG_DIR): path = os.path.join(LOG_DIR, item) From 6a70663201a95471365e287bd2638de7d4df8877 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 14:57:55 +0200 Subject: [PATCH 07/20] gdpr agreements --- src/logs/__init__.py | 2 +- src/logs/guild_logs.py | 117 +++++++++++++++++++++++----------------- src/main.py | 10 +++- src/scanners/scanner.py | 13 ++++- src/utils/gdpr.py | 68 +++++++++++++++++++++++ 5 files changed, 157 insertions(+), 53 deletions(-) create mode 100644 src/utils/gdpr.py diff --git a/src/logs/__init__.py b/src/logs/__init__.py index 358e9af..d62ab1d 100644 --- a/src/logs/__init__.py +++ b/src/logs/__init__.py @@ -1,3 +1,3 @@ from .message_log import MessageLog from .channel_logs import ChannelLogs -from .guild_logs import GuildLogs, ALREADY_RUNNING, CANCELLED +from .guild_logs import GuildLogs, ALREADY_RUNNING, CANCELLED, NO_FILE diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 3c4af39..9aa0d1a 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -23,11 +23,12 @@ current_analysis_lock = threading.Lock() ALREADY_RUNNING = -100 CANCELLED = -200 +NO_FILE = -300 # 5 minutes, assume 'fast' arg MIN_MODIFICATION_TIME = 5 * 60 -# ~6 months, remove log file -MAX_MODIFICATION_TIME = 6 * 30.5 * 24 * 60 * 60 +# ~1 year, remove log file +MAX_MODIFICATION_TIME = 365 * 24 * 60 * 60 class Worker: @@ -110,52 +111,49 @@ class GuildLogs: if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) last_time = None - if os.path.exists(self.log_file): - channels = {} - try: - last_time = os.path.getmtime(self.log_file) - gziped_data = None - await code_message(progress, "Reading saved history (1/4)...") - t0 = datetime.now() - with open(self.log_file, mode="rb") as f: - gziped_data = f.read() - logging.info(f"log {self.guild.id} > read in {delta(t0):,}ms") - if self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved 
history (2/4)...") - t0 = datetime.now() - json_data = gzip.decompress(gziped_data) - del gziped_data - logging.info( - f"log {self.guild.id} > gzip decompress in {delta(t0):,}ms" - ) - if self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved history (3/4)...") - t0 = datetime.now() - channels = json.loads(json_data) - del json_data - logging.info(f"log {self.guild.id} > json parse in {delta(t0):,}ms") - if self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved history (4/4)...") - t0 = datetime.now() - self.channels = { - int(id): ChannelLogs(channels[id], self) for id in channels - } - # remove invalid format - self.channels = { - id: self.channels[id] - for id in self.channels - if self.channels[id].is_format() - } - logging.info(f"log {self.guild.id} > loaded in {delta(t0):,}ms") - except json.decoder.JSONDecodeError: - logging.error(f"log {self.guild.id} > invalid JSON") - except IOError: - logging.error(f"log {self.guild.id} > cannot read") - else: - fast = False + if not os.path.exists(self.log_file): + return NO_FILE, 0 + channels = {} + try: + last_time = os.path.getmtime(self.log_file) + gziped_data = None + await code_message(progress, "Reading saved history (1/4)...") + t0 = datetime.now() + with open(self.log_file, mode="rb") as f: + gziped_data = f.read() + logging.info(f"log {self.guild.id} > read in {delta(t0):,}ms") + if self.check_cancelled(): + return CANCELLED, 0 + await code_message(progress, "Reading saved history (2/4)...") + t0 = datetime.now() + json_data = gzip.decompress(gziped_data) + del gziped_data + logging.info(f"log {self.guild.id} > gzip decompress in {delta(t0):,}ms") + if self.check_cancelled(): + return CANCELLED, 0 + await code_message(progress, "Reading saved history (3/4)...") + t0 = datetime.now() + channels = json.loads(json_data) + del json_data + logging.info(f"log {self.guild.id} > json parse in {delta(t0):,}ms") + if self.check_cancelled(): + 
return CANCELLED, 0 + await code_message(progress, "Reading saved history (4/4)...") + t0 = datetime.now() + self.channels = { + int(id): ChannelLogs(channels[id], self) for id in channels + } + # remove invalid format + self.channels = { + id: self.channels[id] + for id in self.channels + if self.channels[id].is_format() + } + logging.info(f"log {self.guild.id} > loaded in {delta(t0):,}ms") + except json.decoder.JSONDecodeError: + logging.error(f"log {self.guild.id} > invalid JSON") + except IOError: + logging.error(f"log {self.guild.id} > cannot read") if len(target_channels) == 0: target_channels = ( @@ -327,6 +325,29 @@ class GuildLogs: reference=message, ) + @staticmethod + def init_log(guild: List[discord.Guild]): + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + filename = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") + if not os.path.exists(filename): + with open(filename, mode="wb") as f: + f.write(gzip.compress(bytes("{}", "utf-8"))) + logging.info(f"log {guild.id} > created") + else: + logging.info(f"log {guild.id} > already exists") + + @staticmethod + def remove_log(guild: List[discord.Guild]): + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + filename = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") + if os.path.exists(filename): + os.unlink(filename) + logging.info(f"log {guild.id} > removed") + else: + logging.info(f"log {guild.id} > does not exists") + @staticmethod def check_logs(guilds: List[discord.Guild]): logging.info(f"checking logs...") diff --git a/src/main.py b/src/main.py index 69f6b0a..9bd111c 100644 --- a/src/main.py +++ b/src/main.py @@ -6,7 +6,7 @@ if sys.version_info < (3, 7): print("Please upgrade your Python version to 3.7.0 or higher") sys.exit(1) -from utils import emojis +from utils import emojis, gdpr from scanners import ( EmotesScanner, FullScanner, @@ -57,7 +57,13 @@ bot.register_command( "(cancel|stop)", GuildLogs.cancel, "cancel: stop current analysis (not launched with fast)", - "```\n" + "%cancel: Stop current 
analysis (not launched with fast)\n" + "```", + "```\n%cancel: Stop current analysis (not launched with fast)\n```", +) +bot.register_command( + "gdpr", + gdpr.process, + "gdpr: displays GDPR information", + gdpr.HELP, ) bot.register_command( "words", diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 9a7e712..772f63f 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -5,8 +5,15 @@ import logging import re import discord -from utils import no_duplicate, get_intro, delta -from logs import GuildLogs, ChannelLogs, MessageLog, ALREADY_RUNNING, CANCELLED +from utils import no_duplicate, get_intro, delta, gdpr +from logs import ( + GuildLogs, + ChannelLogs, + MessageLog, + ALREADY_RUNNING, + CANCELLED, + NO_FILE, +) class Scanner(ABC): @@ -106,6 +113,8 @@ class Scanner(ABC): "An analysis is already running on this server, please be patient.", reference=message, ) + elif total_msg == NO_FILE: + await message.channel.send(gdpr.TEXT) else: self.msg_count = 0 self.total_msg = 0 diff --git a/src/utils/gdpr.py b/src/utils/gdpr.py new file mode 100644 index 0000000..e19aa98 --- /dev/null +++ b/src/utils/gdpr.py @@ -0,0 +1,68 @@ +import discord + +from logs import GuildLogs + + +HELP = ( + "```\n" + + "%gdpr: Displays GDPR information\n" + + "arguments:\n" + + "* agree - agree to GDPR\n" + + "* revoke - remove this server's data\n" + + "```" +) + +TEXT = ( + "" + + "__**About Analyst-bot's data usage**__\n" + + "**TL;DR**\n" + + "Analyst-bot collects text message information. 
It does not share collected data with any third-party and data is retained 12 months or until the bot is leaving the guild/server.\n" + + "**Data collection**\n" + + "Analyst-bot collects a Discord guild/server's history when asked to.\n" + + "This includes:\n" + + "- Visible text channel names\n" + + "- Visible text messages: date and time of creation and edition, author, content, reactions and other available metadata (pinned, tts, etc.)\n" + + "This does __not__ includes:\n" + + "- Voice channels and not visible channels\n" + + "- Not visible text messages\n" + + "- Visible text messages' embedded content, images and other attachments\n" + + "**Data processing**\n" + + "Any data collected is only processed in order to produce a one-time report sent to the user immediately. No temporary data are retained.\n" + + "**Data storage and retain policy**\n" + + "Analyst-bot stores the collected data in files that are accessible by the software and its administrator only.\n" + + "Any collected data are retained maximum 12 months until deletion or when the bot is leaving a guild/server.\n" + + "**Data sharing**\n" + + "Analyst-bot does not share the data collected with any third-party.\n" + + "**Right to retract**\n" + + "If you want to have your data removed, you can use the `%gdpr revoke` command or remove this bot from your guild/server.\n" + + "**Terms agreement**\n" + + "By agreeing to these terms, you ensure having the legal age if you are in a country that does have one and you also ensure having the consent of every member involved.\n" + + "\n" + + "*If you want more information, please contact the creator of this bot: .*\n" + + "\n" + + "Type `%gdpr agree` to agree to these terms, `%gdpr revoke` to remove this guild/server's collected data or `%gdpr` to see this message again." +) + +AGREE_TEXT = "Thanks for agreeing for these terms, you can now run analysis on this guild/server." + +REVOKE_TEXT = "This guild/server's data has been deleted. 
To run new analysis you must agree to the terms again." + + +async def process(client: discord.client, message: discord.Message, *args: str): + args = list(args) + if len(args) == 1: + await message.channel.send(TEXT) + elif len(args) > 2: + await message.channel.send(f"Too many arguments", reference=message) + elif args[1] == "help": + await message.channel.send(HELP, reference=message) + elif args[1] in ["agree", "accept"]: + GuildLogs.init_log(message.channel.guild) + await message.channel.send(AGREE_TEXT, reference=message) + elif args[1] in ["revoke", "cancel"]: + GuildLogs.remove_log(message.channel.guild) + await message.channel.send(REVOKE_TEXT, reference=message) + else: + await message.channel.send( + f"Unrecognized argument: `{args[1]}`", reference=message + ) From 737806a4bacd7740ab076ccce3ae8dca8af00f56 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 15:00:53 +0200 Subject: [PATCH 08/20] updated readme --- README.md | 2 ++ src/utils/gdpr.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c130ef5..02b294d 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ * %first - read first message * %rand - read a random message * %last - read last message +* %gdpr - displays GDPR information * %emojis - rank emotes by their usage * arguments: * - top emojis, default is 20 @@ -107,6 +108,7 @@ python3 src/main.py * **v1.13** * improved scan `%words` * remove old and unused logs at start and guild leaving + * GDPR disclaimer before scanning * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/src/utils/gdpr.py b/src/utils/gdpr.py index e19aa98..968582c 100644 --- a/src/utils/gdpr.py +++ b/src/utils/gdpr.py @@ -59,7 +59,7 @@ async def process(client: discord.client, message: discord.Message, *args: str): elif args[1] in ["agree", "accept"]: GuildLogs.init_log(message.channel.guild) await message.channel.send(AGREE_TEXT, reference=message) - elif args[1] in ["revoke", "cancel"]: + 
elif args[1] in ["revoke", "cancel", "remove", "delete"]: GuildLogs.remove_log(message.channel.guild) await message.channel.send(REVOKE_TEXT, reference=message) else: From 5f903db9297bd03cd0de3d3748556a386498cce8 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 15:02:08 +0200 Subject: [PATCH 09/20] updated version before forgetting --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 9bd111c..d2278e5 100644 --- a/src/main.py +++ b/src/main.py @@ -33,7 +33,7 @@ emojis.load_emojis() bot = Bot( "Discord Analyst", - "1.12", + "1.13", alias="%", ) From b7a6f3313ba62954bac78da5c148fd657e484652 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 15:34:03 +0200 Subject: [PATCH 10/20] factorized help and triple-quote multi-line --- src/scanners/__init__.py | 2 +- src/scanners/channels_scanner.py | 16 +++---- src/scanners/composition_scanner.py | 12 +---- src/scanners/emotes_scanner.py | 25 +++++----- src/scanners/first_scanner.py | 5 +- src/scanners/frequency_scanner.py | 14 ++---- src/scanners/full_scanner.py | 12 +---- src/scanners/history_scanner.py | 13 ----- src/scanners/last_scanner.py | 5 +- src/scanners/mentioned_scanner.py | 20 ++++---- src/scanners/mentions_scanner.py | 21 ++++----- src/scanners/messages_scanner.py | 16 +++---- src/scanners/presence_scanner.py | 12 +---- src/scanners/random_scanner.py | 5 +- src/scanners/reactions_scanner.py | 15 +++--- src/scanners/words_scanner.py | 21 ++++----- src/utils/gdpr.py | 73 ++++++++++++++--------------- src/utils/utils.py | 32 ++++++++++--- 18 files changed, 134 insertions(+), 185 deletions(-) diff --git a/src/scanners/__init__.py b/src/scanners/__init__.py index 21fd922..ed9141d 100644 --- a/src/scanners/__init__.py +++ b/src/scanners/__init__.py @@ -11,4 +11,4 @@ from .reactions_scanner import ReactionsScanner from .first_scanner import FirstScanner from .last_scanner import LastScanner from .random_scanner import RandomScanner -from
.words_scanner import WordsScanner \ No newline at end of file +from .words_scanner import WordsScanner diff --git a/src/scanners/channels_scanner.py b/src/scanners/channels_scanner.py index b3e7763..c766fb4 100644 --- a/src/scanners/channels_scanner.py +++ b/src/scanners/channels_scanner.py @@ -8,21 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import generate_help, mention, channel_mention class ChannelsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%chan: Rank channels by their messages\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 10\n" - + "* all/everyone - include bots\n" - + "Example: %chan 10 @user\n" - + "```" + return generate_help( + "chan", + "Rank channels by their messages", + args=[" - top , default is 10", "all/everyone - include bots"], + example="5 @user", ) def __init__(self): diff --git a/src/scanners/composition_scanner.py b/src/scanners/composition_scanner.py index daec2f7..a2f3822 100644 --- a/src/scanners/composition_scanner.py +++ b/src/scanners/composition_scanner.py @@ -8,21 +8,13 @@ import discord from .scanner import Scanner from data_types import Composition from logs import ChannelLogs, MessageLog -from utils import emojis, COMMON_HELP_ARGS +from utils import emojis, generate_help class CompositionScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%compo: Show composition statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %compo #mychannel1 @user\n" - + "```" - ) + return generate_help("compo", "Show composition statistics") def __init__(self): super().__init__( diff --git a/src/scanners/emotes_scanner.py b/src/scanners/emotes_scanner.py index 8c7b93f..b126812 100644 --- a/src/scanners/emotes_scanner.py +++ b/src/scanners/emotes_scanner.py @@ -8,24 
+8,23 @@ import discord from logs import ChannelLogs, MessageLog from data_types import Emote, get_emote_dict from .scanner import Scanner -from utils import emojis, COMMON_HELP_ARGS, plural, precise +from utils import emojis, generate_help, plural, precise class EmotesScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%emojis: Rank emojis by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top emojis, default is 20\n" - + "* all - list all common emojis in addition to this guild's\n" - + "* members - show top member for each emojis\n" - + "* sort:usage/reaction - other sorting methods\n" - + "* everyone - include bots\n" - + "Example: %emojis 10 all #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "emojis", + "Rank emojis by their usage", + args=[ + " - top emojis, default is 20", + "all - list all common emojis in addition to this guild's", + "members - show top member for each emojis", + "sort:usage/reaction - other sorting methods", + "everyone - include bots", + ], + example="10 all #mychannel1 #mychannel2 @user", ) def __init__(self): diff --git a/src/scanners/first_scanner.py b/src/scanners/first_scanner.py index 766b145..1048e2c 100644 --- a/src/scanners/first_scanner.py +++ b/src/scanners/first_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class FirstScanner(HistoryScanner): @staticmethod def help() -> str: - return super(FirstScanner, FirstScanner).help( - cmd="first", text="Read first message" - ) + return generate_help("first", "Read first message") def __init__(self): super().__init__(help=FirstScanner.help()) diff --git a/src/scanners/frequency_scanner.py b/src/scanners/frequency_scanner.py index fac0a27..e0a4818 100644 --- a/src/scanners/frequency_scanner.py +++ b/src/scanners/frequency_scanner.py @@ -8,21 +8,13 @@ import discord from .scanner import Scanner from data_types import Frequency 
from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class FrequencyScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%freq: Show frequency-related statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %freq #mychannel1 @user\n" - + "```" - ) + return generate_help("freq", "Show frequency-related statistics") def __init__(self): super().__init__( @@ -55,7 +47,7 @@ class FrequencyScanner(Scanner): freq: Frequency, raw_members: List[int], *, - all_messages: bool + all_messages: bool, ) -> bool: impacted = False # If author is included in the selection (empty list is all) diff --git a/src/scanners/full_scanner.py b/src/scanners/full_scanner.py index 22149bd..ac5cb0e 100644 --- a/src/scanners/full_scanner.py +++ b/src/scanners/full_scanner.py @@ -8,21 +8,13 @@ from .scanner import Scanner from . import FrequencyScanner, CompositionScanner, PresenceScanner from data_types import Frequency, Composition, Presence from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class FullScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%scan: Show full statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %scan #mychannel1 @user\n" - + "```" - ) + return generate_help("scan", "Show full statistics") def __init__(self): super().__init__( diff --git a/src/scanners/history_scanner.py b/src/scanners/history_scanner.py index c61872e..5a3ae4c 100644 --- a/src/scanners/history_scanner.py +++ b/src/scanners/history_scanner.py @@ -7,22 +7,9 @@ import discord from .scanner import Scanner from data_types import History from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS class HistoryScanner(Scanner, ABC): - @staticmethod - def help(*, cmd: str, text: str) -> str: - return ( - "```\n" - + f"%{cmd}: 
{text}\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %{cmd} #mychannel1 @user\n" - + "```" - ) - def __init__(self, *, help: str): super().__init__( has_digit_args=True, diff --git a/src/scanners/last_scanner.py b/src/scanners/last_scanner.py index 7713195..3d8cbf0 100644 --- a/src/scanners/last_scanner.py +++ b/src/scanners/last_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class LastScanner(HistoryScanner): @staticmethod def help() -> str: - return super(LastScanner, LastScanner).help( - cmd="last", text="Read last message" - ) + return generate_help("last", "Read last message") def __init__(self): super().__init__(help=LastScanner.help()) diff --git a/src/scanners/mentioned_scanner.py b/src/scanners/mentioned_scanner.py index 8cf74d4..fa6c09e 100644 --- a/src/scanners/mentioned_scanner.py +++ b/src/scanners/mentioned_scanner.py @@ -8,22 +8,18 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, plural, precise, mention, alt_mention +from utils import generate_help, plural, precise, mention, alt_mention class MentionedScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%mentioned: Rank specific user's mentions by their usage\n" - + "arguments:\n" - + "* @member/me - (required) one or more member\n" - + "\n".join(COMMON_HELP_ARGS.split("\n")[1:]) - + "* - top mentions, default is 10\n" - + "* all - include bots mentions\n" - + "Example: %mentioned 10 @user\n" - + "```" + return generate_help( + "mentioned", + "Rank specific user's mentions by their usage", + args=[" - top , default is 10", "all/everyone - include bots"], + example="5 @user", + replace_args=[" @member/me - (required) one or more member"], ) def __init__(self): @@ -45,7 +41,7 @@ class MentionedScanner(Scanner): "You need to 
mention at least one member or use `me`", reference=message ) return False - self.all_mentions = "all" in args + self.all_mentions = "all" in args or "everyone" in args # Create mentions dict self.mentions = defaultdict(Counter) return True diff --git a/src/scanners/mentions_scanner.py b/src/scanners/mentions_scanner.py index 8890390..50a0f5c 100644 --- a/src/scanners/mentions_scanner.py +++ b/src/scanners/mentions_scanner.py @@ -9,7 +9,7 @@ from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter from utils import ( - COMMON_HELP_ARGS, + generate_help, plural, precise, mention, @@ -22,16 +22,15 @@ from utils import ( class MentionsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%mentions: Rank mentions by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top mentions, default is 10\n" - + "* all - show role/channel/everyone/here mentions\n" - + "* everyone - include bots mentions\n" - + "Example: %mentions 10 #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "mentions", + "Rank mentions by their usage", + args=[ + " - top , default is 10", + "all - show role/channel/everyone/here mentions", + "everyone - include bots mentions", + ], + example="10 #mychannel1 #mychannel2 @user", ) def __init__(self): diff --git a/src/scanners/messages_scanner.py b/src/scanners/messages_scanner.py index f576057..a79735e 100644 --- a/src/scanners/messages_scanner.py +++ b/src/scanners/messages_scanner.py @@ -8,21 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import generate_help, mention, channel_mention class MessagesScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%msg: Rank users by their messages\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 10\n" - + "* all/everyone - 
include bots\n" - + "Example: %msg 10 #channel\n" - + "```" + return generate_help( + "msg", + "Rank users by their messages", + args=[" - top , default is 10", "all/everyone - include bots"], + example="10 #channel", ) def __init__(self): diff --git a/src/scanners/presence_scanner.py b/src/scanners/presence_scanner.py index b19e723..5e39931 100644 --- a/src/scanners/presence_scanner.py +++ b/src/scanners/presence_scanner.py @@ -7,21 +7,13 @@ import discord from .scanner import Scanner from data_types import Presence from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class PresenceScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%pres: Show presence statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %pres #mychannel1 @user\n" - + "```" - ) + return generate_help("pres", "Show presence statistics") def __init__(self): super().__init__( diff --git a/src/scanners/random_scanner.py b/src/scanners/random_scanner.py index 9ef520b..f4fb7a9 100644 --- a/src/scanners/random_scanner.py +++ b/src/scanners/random_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class RandomScanner(HistoryScanner): @staticmethod def help() -> str: - return super(RandomScanner, RandomScanner).help( - cmd="rand", text="Read a random message" - ) + return generate_help("rand", "Read a random message") def __init__(self): super().__init__(help=RandomScanner.help()) diff --git a/src/scanners/reactions_scanner.py b/src/scanners/reactions_scanner.py index bb84387..3603a06 100644 --- a/src/scanners/reactions_scanner.py +++ b/src/scanners/reactions_scanner.py @@ -8,20 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import 
generate_help, mention, channel_mention class ReactionsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%react: Rank users by their reactions\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 10\n" - + "Example: %react 10 #channel\n" - + "```" + return generate_help( + "react", + "Rank users by their reactions", + args=[" - top , default is 10"], + example="10 #channel", ) def __init__(self): diff --git a/src/scanners/words_scanner.py b/src/scanners/words_scanner.py index c31bf9b..f7f6dd7 100644 --- a/src/scanners/words_scanner.py +++ b/src/scanners/words_scanner.py @@ -9,7 +9,7 @@ from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter from utils import ( - COMMON_HELP_ARGS, + generate_help, plural, precise, ) @@ -18,16 +18,15 @@ from utils import ( class WordsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%words: (BETA) Rank words by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - words containings or more letters, default is 3\n" - + "* - top words, default is 10\n" - + "* everyone - include bots\n" - + "Example: %words 5 10 #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "words", + "(BETA) Rank words by their usage", + args=[ + " - words containings or more letters, default is 3", + " - top words, default is 10", + "all/everyone - include bots", + ], + example="5 10 #mychannel1 #mychannel2 @user", ) def __init__(self): diff --git a/src/utils/gdpr.py b/src/utils/gdpr.py index 968582c..5ae85c6 100644 --- a/src/utils/gdpr.py +++ b/src/utils/gdpr.py @@ -3,45 +3,42 @@ import discord from logs import GuildLogs -HELP = ( - "```\n" - + "%gdpr: Displays GDPR information\n" - + "arguments:\n" - + "* agree - agree to GDPR\n" - + "* revoke - remove this server's data\n" - + "```" -) +HELP = """``` +%gdpr: Displays GDPR information +arguments: +* agree - agree to GDPR +* revoke - remove this server's data +```""" 
-TEXT = ( - "" - + "__**About Analyst-bot's data usage**__\n" - + "**TL;DR**\n" - + "Analyst-bot collects text message information. It does not share collected data with any third-party and data is retained 12 months or until the bot is leaving the guild/server.\n" - + "**Data collection**\n" - + "Analyst-bot collects a Discord guild/server's history when asked to.\n" - + "This includes:\n" - + "- Visible text channel names\n" - + "- Visible text messages: date and time of creation and edition, author, content, reactions and other available metadata (pinned, tts, etc.)\n" - + "This does __not__ includes:\n" - + "- Voice channels and not visible channels\n" - + "- Not visible text messages\n" - + "- Visible text messages' embedded content, images and other attachments\n" - + "**Data processing**\n" - + "Any data collected is only processed in order to produce a one-time report sent to the user immediately. No temporary data are retained.\n" - + "**Data storage and retain policy**\n" - + "Analyst-bot stores the collected data in files that are accessible by the software and its administrator only.\n" - + "Any collected data are retained maximum 12 months until deletion or when the bot is leaving a guild/server.\n" - + "**Data sharing**\n" - + "Analyst-bot does not share the data collected with any third-party.\n" - + "**Right to retract**\n" - + "If you want to have your data removed, you can use the `%gdpr revoke` command or remove this bot from your guild/server.\n" - + "**Terms agreement**\n" - + "By agreeing to these terms, you ensure having the legal age if you are in a country that does have one and you also ensure having the consent of every member involved.\n" - + "\n" - + "*If you want more information, please contact the creator of this bot: .*\n" - + "\n" - + "Type `%gdpr agree` to agree to these terms, `%gdpr revoke` to remove this guild/server's collected data or `%gdpr` to see this message again." 
-) +TEXT = """ +__**About Analyst-bot's data usage**__ +**TL;DR** +Analyst-bot collects text message information. It does not share collected data with any third-party and data is retained 18 months or until the bot is leaving the guild/server. +**Data collection** +Analyst-bot collects a Discord guild/server's history when asked to. +This includes: +- Visible text channel names +- Visible text messages: date and time of creation and edition, author, content, reactions and other available metadata (pinned, tts, etc.) +This does __not__ includes: +- Voice channels and not visible channels +- Not visible text messages +- Visible text messages' embedded content, images and other attachments +**Data processing** +Any data collected is only processed in order to produce a one-time report sent to the user immediately. No temporary data are retained. +**Data storage and retain policy** +Analyst-bot stores the collected data in files that are accessible by the software and its administrator only. +Any collected data are retained maximum 18 months until deletion or when the bot is leaving a guild/server. +**Data sharing** +Analyst-bot does not share the data collected with any third-party. +**Right to retract** +If you want to have your data removed, you can use the `%gdpr revoke` command or remove this bot from your guild/server. +**Terms agreement** +By agreeing to these terms, you ensure having the legal age if you are in a country that does have one and you also ensure having the consent of every member involved. + +*If you want more information, please contact the creator of this bot: .* + +Type `%gdpr agree` to agree to these terms, `%gdpr revoke` to remove this guild/server's collected data or `%gdpr` to see this message again. +""" AGREE_TEXT = "Thanks for agreeing for these terms, you can now run analysis on this guild/server." 
diff --git a/src/utils/utils.py b/src/utils/utils.py index 880d892..1447a40 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -7,13 +7,31 @@ from datetime import datetime # OTHER -COMMON_HELP_ARGS = ( - "" - + "* @member/me - filter for one or more member\n" - + "* #channel/here - filter for one or more channel\n" - + "* fast - only read cache\n" - + "* fresh - does not read cache (long)\n" -) +COMMON_HELP_ARGS = [ + "@member/me - filter for one or more member", + "#channel/here - filter for one or more channel", + "fast - only read cache", + "fresh - does not read cache (long)", +] + + +def generate_help( + cmd: str, + info: str, + *, + args=["all/everyone - include bots"], + example="#mychannel1 @user", + replace_args=[], +): + arg_list = "* " + "\n* ".join( + replace_args + COMMON_HELP_ARGS[len(replace_args) :] + args + ) + return f"""``` +%{cmd}: {info} +arguments: +{arg_list} +Example: %{cmd} {example} +```""" def delta(t0: datetime): From 2062f08721373e1ed4a5fd60b9b75ffa78674cab Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 17:39:42 +0200 Subject: [PATCH 11/20] start en stop dates --- README.md | 5 ++ requirements.txt | 1 + src/data_types/frequency.py | 7 ++- src/logs/channel_logs.py | 8 +++ src/scanners/scanner.py | 74 +++++++++++++++++++++++----- src/utils/utils.py | 98 +++++++++++++++++++++++++++---------- 6 files changed, 156 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 02b294d..0970c0a 100644 --- a/README.md +++ b/README.md @@ -53,9 +53,13 @@ * Common arguments: * @member/me: filter for one or more member * #channel/here: filter for one or more channel + * - filter after + * - filter before * all/everyone - include bots messages * fast: only read cache * fresh: does not read cache + +(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week/month/year') ``` ## Running this bot @@ -109,6 +113,7 @@ python3 src/main.py * improved scan `%words` * remove old and unused logs at start and guild 
leaving * GDPR disclaimer before scanning + * start and stop dates * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/requirements.txt b/requirements.txt index 95a454b..7bc9d08 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ discord.py python-dotenv +python-dateutil git+git://github.com/Klemek/miniscord.git diff --git a/src/data_types/frequency.py b/src/data_types/frequency.py index 14cf5dd..075084c 100644 --- a/src/data_types/frequency.py +++ b/src/data_types/frequency.py @@ -38,7 +38,10 @@ class Frequency: *, member_specific: bool, ) -> List[str]: + self.dates.sort() delta = self.dates[-1] - self.dates[0] + if delta.days == 0: + delta = timedelta(days=1) total_msg = len(self.dates) busiest_weekday = top_key(self.week) busiest_hour = top_key(self.day) @@ -56,7 +59,9 @@ class Frequency: f"- **latest message**: {str_datetime(self.dates[-1])} ({from_now(self.dates[-1])})", f"- **messages/day**: {precise(total_msg/delta.days, precision=3)}", f"- **busiest day of week**: {calendar.day_name[busiest_weekday]} (~{precise(self.week[busiest_weekday]/n_weekdays, precision=3)} msg, {percent(self.week[busiest_weekday]/total_msg)})", - f"- **busiest day ever**: {str_date(self.busiest_day)} ({from_now(self.busiest_day)}, {self.busiest_day_count} msg)", + f"- **busiest day ever**: {str_date(self.busiest_day)} ({from_now(self.busiest_day)}, {self.busiest_day_count} msg)" + if self.busiest_day is not None + else "", f"- **messages/hour**: {precise(total_msg*3600/delta.total_seconds(), precision=3)}", f"- **busiest hour of day**: {busiest_hour:0>2}:00 (~{precise(self.day[busiest_hour]/n_hours, precision=3)} msg, {percent(self.day[busiest_hour]/total_msg)})", f"- **busiest hour ever**: {str_datetime(self.busiest_hour)} ({from_now(self.busiest_hour)}, {self.busiest_hour_count} msg)", diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 86c7a28..83681e9 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py 
@@ -1,5 +1,6 @@ from typing import Union, Tuple, Any import discord +from discord import message from . import MessageLog from utils import FakeMessage @@ -17,6 +18,7 @@ class ChannelLogs: self.last_message_id = None self.format = FORMAT self.messages = [] + self.start_date = None elif isinstance(channel, dict): self.format = channel["format"] if "format" in channel else None if not self.is_format(): @@ -31,6 +33,9 @@ class ChannelLogs: self.messages = [ MessageLog(message, self) for message in channel["messages"] ] + self.start_date = ( + self.messages[-1].created_at if len(self.messages) > 0 else None + ) def is_format(self): return self.format == FORMAT @@ -80,6 +85,9 @@ class ChannelLogs: except discord.errors.HTTPException: yield -1, True return # When an exception occurs (like Forbidden) + self.start_date = ( + self.messages[-1].created_at if len(self.messages) > 0 else None + ) yield len(self.messages), True def dict(self) -> dict: diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 772f63f..8931318 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -5,7 +5,16 @@ import logging import re import discord -from utils import no_duplicate, get_intro, delta, gdpr + +from utils import ( + no_duplicate, + get_intro, + delta, + gdpr, + ISO8601_REGEX, + parse_time, + RELATIVE_TIME, +) from logs import ( GuildLogs, ChannelLogs, @@ -54,22 +63,42 @@ class Scanner(ABC): str(channel.id) for channel in message.channel_mentions ] str_mentions = [str(member.id) for member in message.mentions] + dates = [] for i, arg in enumerate(args[1:]): + skip_check = False if re.match(r"^<@!?\d+>$", arg): arg = arg[3:-1] if "!" in arg else arg[2:-1] elif re.match(r"^<#!?\d+>$", arg): arg = arg[3:-1] if "!" 
in arg else arg[2:-1] + elif re.match(ISO8601_REGEX, arg) or arg in RELATIVE_TIME: + dates += [parse_time(arg)] + skip_check = True + if len(dates) > 2: + await message.channel.send( + f"Too many date arguments: `{arg}`", reference=message + ) + return if ( arg not in self.valid_args + ["me", "here", "fast", "fresh"] and (not arg.isdigit() or not self.has_digit_args) and arg not in str_channel_mentions and arg not in str_mentions + and not skip_check ): await message.channel.send( f"Unrecognized argument: `{arg}`", reference=message ) return + self.start_datetime = None if len(dates) < 1 else min(dates) + self.stop_datetime = datetime.now() if len(dates) < 2 else max(dates) + + if self.start_datetime is not None and self.start_datetime > datetime.now(): + await message.channel.send( + f"Start date is after today", reference=message + ) + return + # Get selected channels or all of them if no channel arguments self.channels = no_duplicate(message.channel_mentions) @@ -103,6 +132,18 @@ class Scanner(ABC): total_msg, total_chan = await logs.load( progress, self.channels, fast="fast" in args, fresh="fresh" in args ) + if self.start_datetime is not None: + self.start_datetime = max( + self.start_datetime, + min( + [ + logs.channels[channel.id].start_date + for channel in self.channels + if channel.id in logs.channels + and logs.channels[channel.id].start_date is not None + ] + ), + ) if total_msg == CANCELLED: await message.channel.send( "Operation cancelled by user", @@ -127,13 +168,21 @@ class Scanner(ABC): [ self.compute_message(channel_logs, message_log) for message_log in channel_logs.messages + if ( + self.start_datetime is None + or message_log.created_at >= self.start_datetime + ) + and ( + self.stop_datetime is None + or message_log.created_at <= self.stop_datetime + ) ] ) self.total_msg += len(channel_logs.messages) self.msg_count += count self.chan_count += 1 if count > 0 else 0 logging.info(f"scan {guild.id} > scanned in {delta(t0):,}ms") - if self.total_msg 
== 0: + if self.msg_count == 0: await message.channel.send( "There are no messages found matching the filters", reference=message, @@ -150,21 +199,24 @@ class Scanner(ABC): self.members, self.msg_count, self.chan_count, + self.start_datetime, + self.stop_datetime, ) ) logging.info(f"scan {guild.id} > results in {delta(t0):,}ms") response = "" first = True for r in results: - if len(response + "\n" + r) > 2000: - await message.channel.send( - response, - reference=message if first else None, - allowed_mentions=discord.AllowedMentions.none(), - ) - first = False - response = "" - response += "\n" + r + if r: + if len(response + "\n" + r) > 2000: + await message.channel.send( + response, + reference=message if first else None, + allowed_mentions=discord.AllowedMentions.none(), + ) + first = False + response = "" + response += "\n" + r if len(response) > 0: await message.channel.send( response, diff --git a/src/utils/utils.py b/src/utils/utils.py index 1447a40..a8f2ad5 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -1,15 +1,21 @@ +from calendar import month from typing import List, Dict, Union, Optional, Any import os import logging import discord import math -from datetime import datetime +from datetime import datetime, timedelta +import re +import dateutil.parser +from dateutil.relativedelta import relativedelta # OTHER COMMON_HELP_ARGS = [ "@member/me - filter for one or more member", "#channel/here - filter for one or more channel", + " - filter after ", + " - filter before ", "fast - only read cache", "fresh - does not read cache (long)", ] @@ -30,6 +36,7 @@ def generate_help( %{cmd}: {info} arguments: {arg_list} +(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week/month/year') Example: %{cmd} {example} ```""" @@ -153,6 +160,37 @@ def precise(p: float, *, precision: int = 2) -> str: # DATE FORMATTING +ISO8601_REGEX = 
r"^([\+-]?\d{4}(?!\d{2}\b))((-?)((0[1-9]|1[0-2])(\3([12]\d|0[1-9]|3[01]))?|W([0-4]\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\d|[12]\d{2}|3([0-5]\d|6[1-6])))([T\s]((([01]\d|2[0-3])((:?)[0-5]\d)?|24\:?00)([\.,]\d+(?!:))?)?(\17[0-5]\d([\.,]\d+)?)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?)?)?$" +ISO8601_FULL = "0000-01-01T00:00:00" + + +def parse_iso_datetime(str_date: str) -> datetime: + if re.match( + "^\d{4}(-\d{2}(-\d{2}(T\d{2}(:\d{2}(:\d{2}(:\d{2})?)?)?)?)?)?$", str_date + ): + str_date = str_date + "0000-01-01T00:00:00"[len(str_date) :] + return dateutil.parser.parse(str_date) + + +RELATIVE_TIME = { + "today": relativedelta(days=1), + "yesterday": relativedelta(days=2), + "week": relativedelta(weeks=1), + "month": relativedelta(months=1), + "year": relativedelta(years=1), +} + + +def parse_relative_time(src: str) -> datetime: + return datetime.utcnow() - RELATIVE_TIME[src] + + +def parse_time(src: str) -> datetime: + if src in RELATIVE_TIME: + return parse_relative_time(src) + else: + return parse_iso_datetime(src) + def str_date(date: datetime) -> str: return date.strftime("%d %b. %Y") # 12 Jun. 2018 @@ -162,29 +200,37 @@ def str_datetime(date: datetime) -> str: return date.strftime("%H:%M, %d %b. %Y") # 12:05, 12 Jun. 
2018 -def from_now(src: Optional[datetime]) -> str: - if src is None: - return "never" - delay = datetime.utcnow() - src +def str_delta(delay: timedelta) -> str: seconds = delay.seconds minutes = seconds // 60 hours = minutes // 60 if delay.days < 1: if hours < 1: if minutes == 0: - return "now" + return "no time" elif minutes == 1: - return "a minute ago" + return "a minute" else: - return f"{minutes} minutes ago" + return f"{minutes} minutes" elif hours == 1: - return "an hour ago" + return "an hour" else: - return f"{hours} hours ago" + return f"{hours} hours" elif delay.days == 1: - return "yesterday" + return "one day" else: - return f"{delay.days:,} days ago" + return f"{delay.days:,} days" + + +def from_now(src: Optional[datetime]) -> str: + if src is None: + return "never" + output = str_delta(datetime.utcnow() - src) + if output == "no time": + return "now" + elif output == "one day": + return "yesterday" + return output + " ago" # APP SPECIFIC @@ -197,46 +243,48 @@ def get_intro( members: List[discord.Member], nmm: int, # number of messages impacted nc: int, # number of impacted channels + start_datetime: datetime, + stop_datetime: datetime, ) -> str: """ Get the introduction sentence of the response """ + time_text = "" + if start_datetime is not None: + stop_datetime = datetime.now() if stop_datetime is None else stop_datetime + time_text = f" (in {str_delta(stop_datetime - start_datetime)})" # Show all data (members, channels) when it's less than 5 units if len(members) == 0: # Full scan of the server if full: - return f"{subject} in this server ({nc} channels, {nmm:,} messages):" + return f"{subject} in this server ({nc} channels, {nmm:,} messages){time_text}:" elif len(channels) < 5: - return f"{aggregate([c.mention for c in channels])} {subject.lower()} in {nmm:,} messages:" + return f"{aggregate([c.mention for c in channels])} {subject.lower()} in {nmm:,} messages{time_text}:" else: - return ( - f"These {len(channels)} channels {subject.lower()} in 
{nmm:,} messages:" - ) + return f"These {len(channels)} channels {subject.lower()} in {nmm:,} messages{time_text}:" elif len(members) < 5: if full: - return f"{aggregate([m.mention for m in members])} {subject.lower()} in {nmm:,} messages:" + return f"{aggregate([m.mention for m in members])} {subject.lower()} in {nmm:,} messages{time_text}:" elif len(channels) < 5: return ( f"{aggregate([m.mention for m in members])} on {aggregate([c.mention for c in channels])} " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: return ( f"{aggregate([m.mention for m in members])} on these {len(channels)} channels " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: if full: - return ( - f"These {len(members)} members {subject.lower()} in {nmm:,} messages:" - ) + return f"These {len(members)} members {subject.lower()} in {nmm:,} messages{time_text}:" elif len(channels) < 5: return ( f"These {len(members)} members on {aggregate([c.mention for c in channels])} " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: return ( f"These {len(members)} members on these {len(channels)} channels " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) From 90a26bcc9ce47f50493e2585d1f821f8ea5f2d73 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 18:04:36 +0200 Subject: [PATCH 12/20] flattened results in data_type --- src/data_types/composition.py | 84 +++++++++++------------ src/data_types/frequency.py | 11 +-- src/data_types/presence.py | 124 ++++++++++++++++------------------ src/logs/channel_logs.py | 1 + src/utils/utils.py | 14 +++- 5 files changed, 115 insertions(+), 119 deletions(-) diff --git a/src/data_types/composition.py b/src/data_types/composition.py index e2c0a1c..69364a1 100644 --- a/src/data_types/composition.py +++ b/src/data_types/composition.py @@ -23,49 
+23,45 @@ class Composition: self.spoilers = 0 def to_string(self, msg_count: int) -> List[str]: - ret = [] - ret += [ - f"- **avg. characters / message**: {self.total_characters/msg_count:.2f}" - ] - if self.plain_text > 0: - ret += [ - f"- **plain text messages**: {self.plain_text:,} ({percent(self.plain_text/msg_count)})" - ] - if self.edited > 0: - ret += [ - f"- **edited messages**: {self.edited:,} ({percent(self.edited/msg_count)})" - ] - if self.everyone > 0: - ret += [ - f"- **@\u200beveryone**: {self.everyone:,} ({percent(self.everyone/msg_count)})" - ] - if self.mentions > 0: - ret += [ - f"- **mentions**: {self.mentions:,} (in {percent(self.mention_msg/msg_count)} of msg, avg. {precise(self.mentions/msg_count)}/msg)", - ] - if self.answers > 0: - ret += [ - f"- **answers**: {self.answers:,} ({percent(self.answers/msg_count)})" - ] total_emotes = val_sum(self.emotes) - if total_emotes > 0: - top_emote = top_key(self.emotes) - ret += [ - f"- **emojis**: {total_emotes:,} (in {percent(self.emote_msg/msg_count)} of msg, avg. {precise(total_emotes/msg_count)}/msg)", - f"- **most used emoji**: {top_emote} ({plural(self.emotes[top_emote], 'time')}, {percent(self.emotes[top_emote]/total_emotes)})", - ] - if self.emote_only > 0: - ret += [ - f"- **emoji-only messages**: {self.emote_only:,} ({percent(self.emote_only/msg_count)})" - ] - if self.images > 0: - ret += [f"- **images**: {self.images:,} ({percent(self.images/msg_count)})"] - if self.links > 0: - ret += [f"- **links**: {self.links:,} ({percent(self.link_msg/msg_count)})"] - if self.spoilers > 0: - ret += [ - f"- **spoilers**: {self.spoilers:,} ({percent(self.spoilers/msg_count)})" - ] - if self.tts > 0: - ret += [f"- **tts messages**: {self.tts:,} ({percent(self.tts/msg_count)})"] + top_emote = top_key(self.emotes) + ret = [ + f"- **avg. 
characters / message**: {self.total_characters/msg_count:.2f}", + f"- **plain text messages**: {self.plain_text:,} ({percent(self.plain_text/msg_count)})" + if self.plain_text > 0 + else "", + f"- **edited messages**: {self.edited:,} ({percent(self.edited/msg_count)})" + if self.edited > 0 + else "", + f"- **@\u200beveryone**: {self.everyone:,} ({percent(self.everyone/msg_count)})" + if self.everyone > 0 + else "", + f"- **mentions**: {self.mentions:,} (in {percent(self.mention_msg/msg_count)} of msg, avg. {precise(self.mentions/msg_count)}/msg)" + if self.mentions > 0 + else "", + f"- **answers**: {self.answers:,} ({percent(self.answers/msg_count)})" + if self.answers > 0 + else "", + f"- **emojis**: {total_emotes:,} (in {percent(self.emote_msg/msg_count)} of msg, avg. {precise(total_emotes/msg_count)}/msg)" + if total_emotes > 0 + else "", + f"- **most used emoji**: {top_emote} ({plural(self.emotes[top_emote], 'time')}, {percent(self.emotes[top_emote]/total_emotes)})" + if total_emotes > 0 + else "", + f"- **emoji-only messages**: {self.emote_only:,} ({percent(self.emote_only/msg_count)})" + if self.emote_only > 0 + else "", + f"- **images**: {self.images:,} ({percent(self.images/msg_count)})" + if self.images > 0 + else "", + f"- **links**: {self.links:,} ({percent(self.link_msg/msg_count)})" + if self.links > 0 + else "", + f"- **spoilers**: {self.spoilers:,} ({percent(self.spoilers/msg_count)})" + if self.spoilers > 0 + else "", + f"- **tts messages**: {self.tts:,} ({percent(self.tts/msg_count)})" + if self.tts > 0 + else "", + ] return ret diff --git a/src/data_types/frequency.py b/src/data_types/frequency.py index 075084c..b0f00ec 100644 --- a/src/data_types/frequency.py +++ b/src/data_types/frequency.py @@ -67,13 +67,8 @@ class Frequency: f"- **busiest hour ever**: {str_datetime(self.busiest_hour)} ({from_now(self.busiest_hour)}, {self.busiest_hour_count} msg)", f"- **longest break**: {plural(round(self.longest_break.total_seconds()/3600), 'hour')} 
({plural(self.longest_break.days,'day')}) from {str_datetime(self.longest_break_start)} ({from_now(self.longest_break_start)})", f"- **avg. streak**: {precise(sum(self.streaks)/len(self.streaks), precision=3)} msg", + f"- **longest streak**: {self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)} ({from_now(self.longest_streak_start)})" + if member_specific + else f"- **longest streak**: {mention(self.longest_streak_author)} ({self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)}, {from_now(self.longest_streak_start)})", ] - if member_specific: - ret += [ - f"- **longest streak**: {self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)} ({from_now(self.longest_streak_start)})" - ] - else: - ret += [ - f"- **longest streak**: {mention(self.longest_streak_author)} ({self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)}, {from_now(self.longest_streak_start)})" - ] return ret diff --git a/src/data_types/presence.py b/src/data_types/presence.py index 778881a..682774b 100644 --- a/src/data_types/presence.py +++ b/src/data_types/presence.py @@ -25,74 +25,70 @@ class Presence: show_top_channel: bool, member_specific: bool, ) -> List[str]: - ret = [] if chan_count is None: type = "server's" elif chan_count == 1: type = "channel's" else: type = "channels'" - if member_specific: - ret += [ - f"- **messages**: {msg_count:,} ({percent(msg_count/total_msg)} of {type})" - ] - else: - top_member = top_key(self.messages) - ret += [ - f"- **top messages**: {mention(top_member)} ({self.messages[top_member]:,} msg, {percent(self.messages[top_member]/val_sum(self.messages))})" - ] - if show_top_channel: - top_channel = top_key(self.channel_usage) - channel_sum = val_sum(self.channel_usage) - found_in = sorted( - self.channel_usage, - key=lambda k: self.channel_usage[k] / self.channel_total[k], - )[-1] - ret += [ - f"- **most visited channel**: {channel_mention(top_channel)} ({self.channel_usage[top_channel]:,} 
msg, {percent(self.channel_usage[top_channel]/channel_sum)})", - ] - if member_specific: - ret += [ - f"- **most contributed channel**: {channel_mention(found_in)} ({self.channel_usage[found_in]:,} msg, {percent(self.channel_usage[found_in]/self.channel_total[found_in])} of {type})" - ] - if member_specific: - if len(self.mentions) > 0: - top_mention = top_key(self.mentions) - mention_sum = val_sum(self.mentions) - ret += [ - f"- **was mentioned**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})", - f"- **mostly mentioned by**: {mention(top_mention)} ({plural(self.mentions[top_mention], 'time')}, {percent(self.mentions[top_mention]/mention_sum)})", - ] - if len(self.mention_others) > 0: - top_mention = top_key(self.mention_others) - mention_sum = val_sum(self.mention_others) - if member_specific: - ret += [ - f"- **mentioned others**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})", - f"- **mostly mentioned**: {mention(top_mention)} ({plural(self.mention_others[top_mention], 'time')}, {percent(self.mention_others[top_mention]/mention_sum)})", - ] - else: - top_member = top_key(self.mention_count) - ret += [ - f"- **mentioned**: {plural(mention_sum, 'time')} ({mention(top_member)}, {percent(self.mention_count[top_member]/val_sum(self.mention_count))})", - f"- **top mentions**: {mention(top_member)} ({plural(self.mention_count[top_member], 'time')}, {percent(self.mention_count[top_member]/val_sum(self.mention_count))})", - f"- **most mentioned**: {mention(top_mention)} ({plural(self.mention_others[top_mention], 'time')}, {percent(self.mention_others[top_mention]/mention_sum)})", - ] - if len(self.reactions) > 0: - total_used = val_sum(self.reactions) - top_reaction = top_key(self.reactions) - ret += [ - f"- **reactions**: {plural(total_used, 'time')}", - f"- **most used reaction**: {top_reaction} ({plural(self.reactions[top_reaction], 'time')}, 
{percent(self.reactions[top_reaction]/total_used)})", - ] - if member_specific: - ret[ - -2 - ] += f" ({percent(total_used/val_sum(self.used_reaction))} of {type})" - else: - top_member = top_key(self.used_reaction) - ret.insert( - -1, - f"- **top reactions**: {mention(top_member)} ({plural(self.used_reaction[top_member], 'time')}, {percent(self.used_reaction[top_member]/val_sum(self.used_reaction))})", - ) + top_member = top_key(self.messages) + top_channel = top_key(self.channel_usage) + channel_sum = val_sum(self.channel_usage) + found_in = top_key( + self.channel_usage, + key=lambda k: self.channel_usage[k] / self.channel_total[k], + ) + top_mention = top_key(self.mentions) + mention_sum = val_sum(self.mentions) + top_mention_others = top_key(self.mention_others) + mention_others_sum = val_sum(self.mention_others) + top_member_mentioned = top_key(self.mention_count) + total_reaction_used = val_sum(self.reactions) + top_reaction = top_key(self.reactions) + top_reaction_member = top_key(self.used_reaction) + + ret = [ + f"- **messages**: {msg_count:,} ({percent(msg_count/total_msg)} of {type})" + if member_specific + else f"- **top messages**: {mention(top_member)} ({self.messages[top_member]:,} msg, {percent(self.messages[top_member]/val_sum(self.messages))})", + f"- **most visited channel**: {channel_mention(top_channel)} ({self.channel_usage[top_channel]:,} msg, {percent(self.channel_usage[top_channel]/channel_sum)})" + if show_top_channel + else "", + f"- **most contributed channel**: {channel_mention(found_in)} ({self.channel_usage[found_in]:,} msg, {percent(self.channel_usage[found_in]/self.channel_total[found_in])} of {type})" + if show_top_channel and member_specific + else "", + f"- **was mentioned**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})" + if member_specific and len(self.mentions) > 0 + else "", + f"- **mostly mentioned by**: {mention(top_mention)} ({plural(self.mentions[top_mention], 'time')}, 
{percent(self.mentions[top_mention]/mention_sum)})" + if member_specific and len(self.mentions) > 0 + else "", + f"- **mentioned others**: {plural(mention_others_sum, 'time')} ({percent(mention_others_sum/val_sum(self.mention_count))} of {type})" + if len(self.mention_others) > 0 and member_specific + else "", + f"- **mostly mentioned**: {mention(top_mention_others)} ({plural(self.mention_others[top_mention_others], 'time')}, {percent(self.mention_others[top_mention_others]/mention_others_sum)})" + if len(self.mention_others) > 0 and member_specific + else "", + f"- **mentioned**: {plural(mention_others_sum, 'time')} ({mention(top_member_mentioned)}, {percent(self.mention_count[top_member_mentioned]/val_sum(self.mention_count))})" + if len(self.mention_others) > 0 and not member_specific + else "", + f"- **top mentions**: {mention(top_member_mentioned)} ({plural(self.mention_count[top_member_mentioned], 'time')}, {percent(self.mention_count[top_member_mentioned]/val_sum(self.mention_count))})" + if len(self.mention_others) > 0 and not member_specific + else "", + f"- **most mentioned**: {mention(top_mention_others)} ({plural(self.mention_others[top_mention_others], 'time')}, {percent(self.mention_others[top_mention_others]/mention_others_sum)})" + if len(self.mention_others) > 0 and not member_specific + else "", + f"- **reactions**: {plural(total_reaction_used, 'time')}" + if len(self.reactions) > 0 and not member_specific + else "", + f"- **reactions**: {plural(total_reaction_used, 'time')} ({percent(total_reaction_used/val_sum(self.used_reaction))} of {type})" + if len(self.reactions) > 0 and member_specific + else "", + f"- **top reactions**: {mention(top_reaction_member)} ({plural(self.used_reaction[top_reaction_member], 'time')}, {percent(self.used_reaction[top_reaction_member]/val_sum(self.used_reaction))})" + if len(self.reactions) > 0 and not member_specific + else "", + f"- **most used reaction**: {top_reaction} ({plural(self.reactions[top_reaction], 
'time')}, {percent(self.reactions[top_reaction]/total_reaction_used)})" + if len(self.reactions) > 0 + else "", + ] return ret diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 83681e9..99fbfdb 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py @@ -94,5 +94,6 @@ class ChannelLogs: channel = dict(self.__dict__) channel.pop("channel", None) channel.pop("guild", None) + channel.pop("start_date", None) channel["messages"] = [message.dict() for message in self.messages] return channel diff --git a/src/utils/utils.py b/src/utils/utils.py index a8f2ad5..a823399 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -1,5 +1,5 @@ from calendar import month -from typing import List, Dict, Union, Optional, Any +from typing import Callable, List, Dict, Union, Optional, Any import os import logging import discord @@ -117,11 +117,19 @@ def no_duplicate(seq: list) -> list: # DICTS -def top_key(d: Dict[Union[str, int], int]) -> Union[str, int]: - return sorted(d, key=lambda k: d[k])[-1] +def top_key( + d: Dict[Union[str, int], int], key: Optional[Callable] = None +) -> Union[str, int]: + if len(d) == 0: + return None + if key is None: + key = lambda k: d[k] + return sorted(d, key=key)[-1] def val_sum(d: Dict[Any, int]) -> int: + if len(d) == 0: + return 0 return sum(d.values()) From 802e20809270c2ff8a19198b5ef03950a7d8ad72 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 18:19:40 +0200 Subject: [PATCH 13/20] alternative syntax for relative time range --- README.md | 2 +- src/scanners/scanner.py | 4 ++-- src/utils/utils.py | 36 ++++++++++++++++++++++++++---------- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0970c0a..6f79ee0 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ * fast: only read cache * fresh: does not read cache -(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week/month/year') +(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 
8601) or 'week' or '8days' or '1y') ``` ## Running this bot diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 8931318..f30cb11 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -12,8 +12,8 @@ from utils import ( delta, gdpr, ISO8601_REGEX, + RELATIVE_REGEX, parse_time, - RELATIVE_TIME, ) from logs import ( GuildLogs, @@ -70,7 +70,7 @@ class Scanner(ABC): arg = arg[3:-1] if "!" in arg else arg[2:-1] elif re.match(r"^<#!?\d+>$", arg): arg = arg[3:-1] if "!" in arg else arg[2:-1] - elif re.match(ISO8601_REGEX, arg) or arg in RELATIVE_TIME: + elif re.match(ISO8601_REGEX, arg) or re.match(RELATIVE_REGEX, arg): dates += [parse_time(arg)] skip_check = True if len(dates) > 2: diff --git a/src/utils/utils.py b/src/utils/utils.py index a823399..b18265a 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -36,7 +36,7 @@ def generate_help( %{cmd}: {info} arguments: {arg_list} -(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week/month/year') +(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week' or '8days' or '1y') Example: %{cmd} {example} ```""" @@ -180,21 +180,37 @@ def parse_iso_datetime(str_date: str) -> datetime: return dateutil.parser.parse(str_date) -RELATIVE_TIME = { - "today": relativedelta(days=1), - "yesterday": relativedelta(days=2), - "week": relativedelta(weeks=1), - "month": relativedelta(months=1), - "year": relativedelta(years=1), -} +RELATIVE_REGEX = ( + r"(yesterday|today|\d*h(ours?)?|\d*d(ays?)?|\d*w(eeks?)?|\d*m(onths?)?|\d*y(ears?))" +) def parse_relative_time(src: str) -> datetime: - return datetime.utcnow() - RELATIVE_TIME[src] + timezone_delta = datetime.utcnow() - datetime.now() + if src == "today": + return datetime.today() + timezone_delta + elif src == "yesterday": + return datetime.today() - relativedelta(days=1) + timezone_delta + else: + m = re.match("(\d*)(\w+)", src) + delta = None + value = int(m[1]) + unit = m[2][0] + if unit == "h": + delta = 
relativedelta(hours=value) + elif unit == "d": + delta = relativedelta(days=value) + elif unit == "w": + delta = relativedelta(weeks=value) + elif unit == "m": + delta = relativedelta(months=value) + elif unit == "y": + delta = relativedelta(years=value) + return datetime.utcnow() - delta def parse_time(src: str) -> datetime: - if src in RELATIVE_TIME: + if re.match(RELATIVE_REGEX, src): return parse_relative_time(src) else: return parse_iso_datetime(src) From 8c0605797a4892a5914edc74ab7897f7c3613e80 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 18:23:46 +0200 Subject: [PATCH 14/20] clarified dates syntax --- README.md | 2 +- src/utils/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6f79ee0..b3cae58 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ * fast: only read cache * fresh: does not read cache -(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week' or '8days' or '1y') +(Sample dates: 2020 / 2021-11 / 2021-06-28 / 2020-06-28T23:00 / today / week / 8days / 1y) ``` ## Running this bot diff --git a/src/utils/utils.py b/src/utils/utils.py index b18265a..e163f04 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -36,7 +36,7 @@ def generate_help( %{cmd}: {info} arguments: {arg_list} -(Dates are formated 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm' (ISO 8601) or 'week' or '8days' or '1y') +(Sample dates: 2020 / 2021-11 / 2021-06-28 / 2020-06-28T23:00 / today / week / 8days / 1y) Example: %{cmd} {example} ```""" From 5c570ee09b9996b8a6bf03ef70a302252ffb6ca3 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 18:25:51 +0200 Subject: [PATCH 15/20] fix no value in relative time --- src/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/utils.py b/src/utils/utils.py index e163f04..729d633 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -194,7 +194,7 @@ def parse_relative_time(src: str) -> datetime: else: m = 
re.match("(\d*)(\w+)", src) delta = None - value = int(m[1]) + value = int(m[1]) if m[1] else 1 unit = m[2][0] if unit == "h": delta = relativedelta(hours=value) From 09161850c535f90c8434c82a39aa177877b8eb57 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 18:29:27 +0200 Subject: [PATCH 16/20] clarified not serialized attributes --- README.md | 1 + src/logs/channel_logs.py | 7 ++++--- src/logs/message_log.py | 6 +++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b3cae58..ad239ac 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ python3 src/main.py * remove old and unused logs at start and guild leaving * GDPR disclaimer before scanning * start and stop dates + * bug fix and improvements * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 99fbfdb..39d854b 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py @@ -8,6 +8,8 @@ from utils import FakeMessage CHUNK_SIZE = 2000 FORMAT = 3 +NOT_SERIALIZED = ["channel", "guild", "start_date"] + class ChannelLogs: def __init__(self, channel: Union[discord.TextChannel, dict], guild: Any): @@ -92,8 +94,7 @@ class ChannelLogs: def dict(self) -> dict: channel = dict(self.__dict__) - channel.pop("channel", None) - channel.pop("guild", None) - channel.pop("start_date", None) + for key in NOT_SERIALIZED: + channel.pop(key, None) channel["messages"] = [message.dict() for message in self.messages] return channel diff --git a/src/logs/message_log.py b/src/logs/message_log.py index f534155..263c245 100644 --- a/src/logs/message_log.py +++ b/src/logs/message_log.py @@ -8,6 +8,9 @@ IMAGE_FORMAT = [".gif", ".gifv", ".png", ".jpg", ".jpeg", ".bmp"] EMBED_IMAGES = ["image", "gifv"] +NOT_SERIALIZED = ["channel"] + + class MessageLog: def __init__(self, message: Union[discord.Message, dict], channel: Any): self.channel = channel @@ -79,7 +82,8 @@ class MessageLog: def dict(self) -> 
dict: message = dict(self.__dict__) - message.pop("channel", None) + for key in NOT_SERIALIZED: + message.pop(key, None) message["created_at"] = self.created_at.isoformat() message["edited_at"] = ( self.edited_at.isoformat() if self.edited_at is not None else None From 0e4ed0eb6b30c0bdd3f350f7a023d1f6fb93a676 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 19:07:43 +0200 Subject: [PATCH 17/20] only fetch history of given time --- src/logs/channel_logs.py | 84 ++++++++++++++++++++++++++++------------ src/logs/guild_logs.py | 26 ++++++++++--- src/scanners/scanner.py | 52 ++++++++++++++----------- src/utils/utils.py | 4 +- 4 files changed, 111 insertions(+), 55 deletions(-) diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 39d854b..9e83b2b 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py @@ -1,6 +1,7 @@ from typing import Union, Tuple, Any import discord from discord import message +from datetime import datetime from . import MessageLog from utils import FakeMessage @@ -18,6 +19,7 @@ class ChannelLogs: self.id = channel.id self.name = channel.name self.last_message_id = None + self.first_message_id = None self.format = FORMAT self.messages = [] self.start_date = None @@ -32,6 +34,12 @@ class ChannelLogs: if channel["last_message_id"] is not None else None ) + self.first_message_id = ( + int(channel["first_message_id"]) + if "first_message_id" in channel + and channel["first_message_id"] is not None + else None + ) self.messages = [ MessageLog(message, self) for message in channel["messages"] ] @@ -42,48 +50,74 @@ class ChannelLogs: def is_format(self): return self.format == FORMAT - async def load(self, channel: discord.TextChannel) -> Tuple[int, int]: + async def load( + self, channel: discord.TextChannel, start_date: datetime, stop_date: datetime + ) -> Tuple[int, int]: self.name = channel.name self.channel = channel + is_empty = self.last_message_id is None try: - if self.last_message_id is not None: # append + 
if is_empty: + sanity_check = len(await channel.history(limit=1).flatten()) + if sanity_check != 1: + yield len(self.messages), True + return + # load backward + if is_empty or ( + start_date is not None + and self.start_date > start_date + and self.first_message_id is not None + ): + first_message_id = self.first_message_id + first_message_date = None + tmp_message_id = 0 + done = 0 + while ( + done >= CHUNK_SIZE + or first_message_id is None + or (first_message_date is None or first_message_date >= start_date) + and start_date is not None + ) and tmp_message_id != first_message_id: + tmp_message_id = first_message_id + done = 0 + async for message in channel.history( + limit=CHUNK_SIZE, + before=FakeMessage(first_message_id) + if first_message_id is not None + else None, + oldest_first=False, + ): + done += 1 + first_message_id = message.id + first_message_date = message.created_at + m = MessageLog(message, self) + await m.load(message) + self.messages += [m] + yield len(self.messages), False + if done >= CHUNK_SIZE and first_message_date < start_date: + # date was limiting here, store first message id + self.first_message_id = first_message_id + self.last_message_id = channel.last_message_id + # load forward + if not is_empty: tmp_message_id = None + last_message_date = self.messages[0].created_at while ( self.last_message_id != channel.last_message_id - and self.last_message_id != tmp_message_id - ): + or (stop_date is not None and last_message_date <= stop_date) + ) and self.last_message_id != tmp_message_id: tmp_message_id = self.last_message_id async for message in channel.history( limit=CHUNK_SIZE, after=FakeMessage(self.last_message_id), oldest_first=True, ): + last_message_date = message.created_at self.last_message_id = message.id m = MessageLog(message, self) await m.load(message) self.messages.insert(0, m) yield len(self.messages), False - else: # first load - last_message_id = None - done = 0 - sanity_check = len(await 
channel.history(limit=1).flatten()) - if sanity_check == 1: - while done >= CHUNK_SIZE or last_message_id is None: - done = 0 - async for message in channel.history( - limit=CHUNK_SIZE, - before=FakeMessage(last_message_id) - if last_message_id is not None - else None, - oldest_first=False, - ): - done += 1 - last_message_id = message.id - m = MessageLog(message, self) - await m.load(message) - self.messages += [m] - yield len(self.messages), False - self.last_message_id = channel.last_message_id except discord.errors.HTTPException: yield -1, True return # When an exception occurs (like Forbidden) diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 9aa0d1a..6ba20cf 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -32,7 +32,13 @@ MAX_MODIFICATION_TIME = 365 * 24 * 60 * 60 class Worker: - def __init__(self, channel_log: ChannelLogs, channel: discord.TextChannel): + def __init__( + self, + channel_log: ChannelLogs, + channel: discord.TextChannel, + start_date: datetime, + stop_date: datetime, + ): self.channel_log = channel_log self.channel = channel self.start_msg = len(channel_log.messages) @@ -41,12 +47,16 @@ class Worker: self.done = False self.cancelled = False self.loop = asyncio.get_event_loop() + self.start_date = start_date + self.stop_date = stop_date def start(self): asyncio.run_coroutine_threadsafe(self.process(), self.loop) async def process(self): - async for count, done in self.channel_log.load(self.channel): + async for count, done in self.channel_log.load( + self.channel, self.start_date, self.stop_date + ): if count > 0: self.queried_msg = count - self.start_msg self.total_msg = count @@ -98,7 +108,9 @@ class GuildLogs: async def load( self, progress: discord.Message, - target_channels: List[discord.TextChannel] = [], + target_channels: List[discord.TextChannel], + start_date: datetime, + stop_date: datetime, *, fast: bool, fresh: bool, @@ -173,6 +185,8 @@ class GuildLogs: if ( not fast and not fresh + and start_date 
is None + and stop_date is None and last_time is not None and (time.time() - last_time) < MIN_MODIFICATION_TIME ): @@ -214,7 +228,9 @@ class GuildLogs: if channel.id not in self.channels or fresh: loading_new += 1 self.channels[channel.id] = ChannelLogs(channel, self) - workers += [Worker(self.channels[channel.id], channel)] + workers += [ + Worker(self.channels[channel.id], channel, start_date, stop_date) + ] warning_msg = "(this might take a while)" if len(target_channels) > 5 and loading_new > 5: warning_msg = "(most channels are new, this will take a long while)" @@ -255,7 +271,7 @@ class GuildLogs: f"Reading new history...\n{total_msg:,} messages in {total_chan:,}/{max_chan:,} channels ({round(queried_msg/deltas(t0)):,}m/s)\n{warning_msg}{remaining_msg}", ) logging.info( - f"log {self.guild.id} > queried in {delta(t0):,}ms -> {queried_msg / deltas(t0):,.3f} m/s" + f"log {self.guild.id} > queried {queried_msg} in {delta(t0):,}ms -> {queried_msg / deltas(t0):,.3f} m/s" ) # write logs real_total_msg = sum( diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index f30cb11..62ba168 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -90,10 +90,10 @@ class Scanner(ABC): ) return - self.start_datetime = None if len(dates) < 1 else min(dates) - self.stop_datetime = datetime.now() if len(dates) < 2 else max(dates) + self.start_date = None if len(dates) < 1 else min(dates) + self.stop_date = None if len(dates) < 2 else max(dates) - if self.start_datetime is not None and self.start_datetime > datetime.now(): + if self.start_date is not None and self.start_date > datetime.now(): await message.channel.send( f"Start date is after today", reference=message ) @@ -130,20 +130,13 @@ class Scanner(ABC): allowed_mentions=discord.AllowedMentions.none(), ) total_msg, total_chan = await logs.load( - progress, self.channels, fast="fast" in args, fresh="fresh" in args + progress, + self.channels, + self.start_date, + self.stop_date, + fast="fast" in args, 
+ fresh="fresh" in args, ) - if self.start_datetime is not None: - self.start_datetime = max( - self.start_datetime, - min( - [ - logs.channels[channel.id].start_date - for channel in self.channels - if channel.id in logs.channels - and logs.channels[channel.id].start_date is not None - ] - ), - ) if total_msg == CANCELLED: await message.channel.send( "Operation cancelled by user", @@ -157,6 +150,21 @@ class Scanner(ABC): elif total_msg == NO_FILE: await message.channel.send(gdpr.TEXT) else: + if self.start_date is not None: + self.start_date = max( + self.start_date, + min( + [ + logs.channels[channel.id].start_date + for channel in self.channels + if channel.id in logs.channels + and logs.channels[channel.id].start_date is not None + ] + ), + ) + if self.stop_date is None: + self.stop_date = datetime.utcnow() + self.msg_count = 0 self.total_msg = 0 self.chan_count = 0 @@ -169,12 +177,12 @@ class Scanner(ABC): self.compute_message(channel_logs, message_log) for message_log in channel_logs.messages if ( - self.start_datetime is None - or message_log.created_at >= self.start_datetime + self.start_date is None + or message_log.created_at >= self.start_date ) and ( - self.stop_datetime is None - or message_log.created_at <= self.stop_datetime + self.stop_date is None + or message_log.created_at <= self.stop_date ) ] ) @@ -199,8 +207,8 @@ class Scanner(ABC): self.members, self.msg_count, self.chan_count, - self.start_datetime, - self.stop_datetime, + self.start_date, + self.stop_date, ) ) logging.info(f"scan {guild.id} > results in {delta(t0):,}ms") diff --git a/src/utils/utils.py b/src/utils/utils.py index 729d633..a439ffd 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -180,9 +180,7 @@ def parse_iso_datetime(str_date: str) -> datetime: return dateutil.parser.parse(str_date) -RELATIVE_REGEX = ( - r"(yesterday|today|\d*h(ours?)?|\d*d(ays?)?|\d*w(eeks?)?|\d*m(onths?)?|\d*y(ears?))" -) +RELATIVE_REGEX = 
r"(yesterday|today|\d*h(ours?)?|\d*d(ays?)?|\d*w(eeks?)?|\d*m(onths?)?|\d*y(ears?)?)" def parse_relative_time(src: str) -> datetime: From 715a5985130ef774e74ff8464bd4211ac9107416 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 19:11:30 +0200 Subject: [PATCH 18/20] fix cancelled bug --- src/logs/guild_logs.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 6ba20cf..3e5f748 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -89,21 +89,22 @@ class GuildLogs: return self.locked and self.log_file not in current_analysis def lock(self) -> bool: - self.locked = True current_analysis_lock.acquire() if self.log_file in current_analysis: current_analysis_lock.release() return False + self.locked = True current_analysis.append(self.log_file) current_analysis_lock.release() return True def unlock(self): - self.locked = False - current_analysis_lock.acquire() - if self.log_file in current_analysis: - current_analysis.remove(self.log_file) - current_analysis_lock.release() + if self.locked: + self.locked = False + current_analysis_lock.acquire() + if self.log_file in current_analysis: + current_analysis.remove(self.log_file) + current_analysis_lock.release() async def load( self, From cf6fa7ccf2aed65908054f0868dca5725902bac3 Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 19:49:34 +0200 Subject: [PATCH 19/20] smol fix --- src/data_types/frequency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_types/frequency.py b/src/data_types/frequency.py index b0f00ec..aab30cf 100644 --- a/src/data_types/frequency.py +++ b/src/data_types/frequency.py @@ -49,7 +49,7 @@ class Frequency: if ( self.dates[0].weekday() <= busiest_weekday and self.dates[-1].weekday() >= busiest_weekday - ): + ) or n_weekdays == 0: n_weekdays += 1 n_hours = delta.days if self.dates[0].hour <= busiest_hour and self.dates[-1].hour >= busiest_hour: From 
76af4661ed8ea80b0d0662b3bb34da5f7bfb780e Mon Sep 17 00:00:00 2001 From: Klemek Date: Fri, 9 Apr 2021 19:50:12 +0200 Subject: [PATCH 20/20] fixed time range loading --- src/logs/channel_logs.py | 39 ++++++++++++++++++++------------------- src/logs/guild_logs.py | 2 ++ src/scanners/scanner.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 9e83b2b..a5e3857 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py @@ -64,47 +64,48 @@ class ChannelLogs: return # load backward if is_empty or ( - start_date is not None - and self.start_date > start_date - and self.first_message_id is not None + self.first_message_id is not None + and ( + start_date is None + or (self.start_date is not None and self.start_date > start_date) + ) ): - first_message_id = self.first_message_id first_message_date = None tmp_message_id = 0 done = 0 while ( - done >= CHUNK_SIZE - or first_message_id is None - or (first_message_date is None or first_message_date >= start_date) - and start_date is not None - ) and tmp_message_id != first_message_id: - tmp_message_id = first_message_id + first_message_date is None + or ( + done >= CHUNK_SIZE + and (start_date is None or first_message_date > start_date) + ) + ) and tmp_message_id != self.first_message_id: + tmp_message_id = self.first_message_id done = 0 async for message in channel.history( limit=CHUNK_SIZE, - before=FakeMessage(first_message_id) - if first_message_id is not None + before=FakeMessage(self.first_message_id) + if self.first_message_id is not None else None, oldest_first=False, ): done += 1 - first_message_id = message.id + self.first_message_id = message.id first_message_date = message.created_at m = MessageLog(message, self) await m.load(message) self.messages += [m] yield len(self.messages), False - if done >= CHUNK_SIZE and first_message_date < start_date: - # date was limiting here, store first message id - self.first_message_id = 
first_message_id + if done < CHUNK_SIZE: # reached bottom + self.first_message_id = None self.last_message_id = channel.last_message_id # load forward - if not is_empty: + last_message_date = self.messages[0].created_at + if not is_empty and (stop_date is None or last_message_date < stop_date): tmp_message_id = None - last_message_date = self.messages[0].created_at while ( self.last_message_id != channel.last_message_id - or (stop_date is not None and last_message_date <= stop_date) + and (stop_date is None or last_message_date < stop_date) ) and self.last_message_id != tmp_message_id: tmp_message_id = self.last_message_id async for message in channel.history( diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 3e5f748..7600077 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -195,8 +195,10 @@ class GuildLogs: channel for channel in target_channels if channel.id not in self.channels + or self.channels[channel.id].first_message_id is not None ] if len(invalid_target_channels) == 0: + logging.info(f"log {self.guild.id} > assumed fast") fast = True if self.locked: self.unlock() diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 62ba168..b4a96f1 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -150,7 +150,7 @@ class Scanner(ABC): elif total_msg == NO_FILE: await message.channel.send(gdpr.TEXT) else: - if self.start_date is not None: + if self.start_date is not None and len(logs.channels) > 0: self.start_date = max( self.start_date, min(