diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 0000000..5029c28 --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,26 @@ +name: Python + +on: ["push", "pull_request"] + +jobs: + syntax: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.7, 3.8, 3.9] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics \ No newline at end of file diff --git a/README.md b/README.md index 881d143..ad239ac 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ * %first - read first message * %rand - read a random message * %last - read last message +* %gdpr - displays GDPR information * %emojis - rank emotes by their usage * arguments: * - top emojis, default is 20 @@ -43,7 +44,7 @@ * %react - rank users by their reactions * arguments: * - top messages, default is 10 -* %words - rank words by their usage +* %words - (BETA) rank words by their usage * arguments: * - words containings or more letters, default is 3 * - top words, default is 10 @@ -52,9 +53,13 @@ * Common arguments: * @member/me: filter for one or more member * #channel/here: filter for one or more channel + * - filter after + * - filter before * all/everyone - include bots messages * fast: only read cache * fresh: does not read cache + +(Sample dates: 2020 / 2021-11 / 2021-06-28 / 2020-06-28T23:00 / today / week / 8days / 1y) ``` ## Running this bot @@ -104,6 +109,12 @@ 
python3 src/main.py ## Changelog +* **v1.13** + * improved scan `%words` + * remove old and unused logs at start and guild leaving + * GDPR disclaimer before scanning + * start and stop dates + * bug fix and improvements * **v1.12** * more scans: `%words` * concurrent `fast` analysis diff --git a/requirements.txt b/requirements.txt index 95a454b..7bc9d08 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ discord.py python-dotenv +python-dateutil git+git://github.com/Klemek/miniscord.git diff --git a/src/data_types/composition.py b/src/data_types/composition.py index e2c0a1c..69364a1 100644 --- a/src/data_types/composition.py +++ b/src/data_types/composition.py @@ -23,49 +23,45 @@ class Composition: self.spoilers = 0 def to_string(self, msg_count: int) -> List[str]: - ret = [] - ret += [ - f"- **avg. characters / message**: {self.total_characters/msg_count:.2f}" - ] - if self.plain_text > 0: - ret += [ - f"- **plain text messages**: {self.plain_text:,} ({percent(self.plain_text/msg_count)})" - ] - if self.edited > 0: - ret += [ - f"- **edited messages**: {self.edited:,} ({percent(self.edited/msg_count)})" - ] - if self.everyone > 0: - ret += [ - f"- **@\u200beveryone**: {self.everyone:,} ({percent(self.everyone/msg_count)})" - ] - if self.mentions > 0: - ret += [ - f"- **mentions**: {self.mentions:,} (in {percent(self.mention_msg/msg_count)} of msg, avg. {precise(self.mentions/msg_count)}/msg)", - ] - if self.answers > 0: - ret += [ - f"- **answers**: {self.answers:,} ({percent(self.answers/msg_count)})" - ] total_emotes = val_sum(self.emotes) - if total_emotes > 0: - top_emote = top_key(self.emotes) - ret += [ - f"- **emojis**: {total_emotes:,} (in {percent(self.emote_msg/msg_count)} of msg, avg. 
{precise(total_emotes/msg_count)}/msg)", - f"- **most used emoji**: {top_emote} ({plural(self.emotes[top_emote], 'time')}, {percent(self.emotes[top_emote]/total_emotes)})", - ] - if self.emote_only > 0: - ret += [ - f"- **emoji-only messages**: {self.emote_only:,} ({percent(self.emote_only/msg_count)})" - ] - if self.images > 0: - ret += [f"- **images**: {self.images:,} ({percent(self.images/msg_count)})"] - if self.links > 0: - ret += [f"- **links**: {self.links:,} ({percent(self.link_msg/msg_count)})"] - if self.spoilers > 0: - ret += [ - f"- **spoilers**: {self.spoilers:,} ({percent(self.spoilers/msg_count)})" - ] - if self.tts > 0: - ret += [f"- **tts messages**: {self.tts:,} ({percent(self.tts/msg_count)})"] + top_emote = top_key(self.emotes) + ret = [ + f"- **avg. characters / message**: {self.total_characters/msg_count:.2f}", + f"- **plain text messages**: {self.plain_text:,} ({percent(self.plain_text/msg_count)})" + if self.plain_text > 0 + else "", + f"- **edited messages**: {self.edited:,} ({percent(self.edited/msg_count)})" + if self.edited > 0 + else "", + f"- **@\u200beveryone**: {self.everyone:,} ({percent(self.everyone/msg_count)})" + if self.everyone > 0 + else "", + f"- **mentions**: {self.mentions:,} (in {percent(self.mention_msg/msg_count)} of msg, avg. {precise(self.mentions/msg_count)}/msg)" + if self.mentions > 0 + else "", + f"- **answers**: {self.answers:,} ({percent(self.answers/msg_count)})" + if self.answers > 0 + else "", + f"- **emojis**: {total_emotes:,} (in {percent(self.emote_msg/msg_count)} of msg, avg. 
{precise(total_emotes/msg_count)}/msg)" + if total_emotes > 0 + else "", + f"- **most used emoji**: {top_emote} ({plural(self.emotes[top_emote], 'time')}, {percent(self.emotes[top_emote]/total_emotes)})" + if total_emotes > 0 + else "", + f"- **emoji-only messages**: {self.emote_only:,} ({percent(self.emote_only/msg_count)})" + if self.emote_only > 0 + else "", + f"- **images**: {self.images:,} ({percent(self.images/msg_count)})" + if self.images > 0 + else "", + f"- **links**: {self.links:,} ({percent(self.link_msg/msg_count)})" + if self.links > 0 + else "", + f"- **spoilers**: {self.spoilers:,} ({percent(self.spoilers/msg_count)})" + if self.spoilers > 0 + else "", + f"- **tts messages**: {self.tts:,} ({percent(self.tts/msg_count)})" + if self.tts > 0 + else "", + ] return ret diff --git a/src/data_types/frequency.py b/src/data_types/frequency.py index 14cf5dd..aab30cf 100644 --- a/src/data_types/frequency.py +++ b/src/data_types/frequency.py @@ -38,7 +38,10 @@ class Frequency: *, member_specific: bool, ) -> List[str]: + self.dates.sort() delta = self.dates[-1] - self.dates[0] + if delta.days == 0: + delta = timedelta(days=1) total_msg = len(self.dates) busiest_weekday = top_key(self.week) busiest_hour = top_key(self.day) @@ -46,7 +49,7 @@ class Frequency: if ( self.dates[0].weekday() <= busiest_weekday and self.dates[-1].weekday() >= busiest_weekday - ): + ) or n_weekdays == 0: n_weekdays += 1 n_hours = delta.days if self.dates[0].hour <= busiest_hour and self.dates[-1].hour >= busiest_hour: @@ -56,19 +59,16 @@ class Frequency: f"- **latest message**: {str_datetime(self.dates[-1])} ({from_now(self.dates[-1])})", f"- **messages/day**: {precise(total_msg/delta.days, precision=3)}", f"- **busiest day of week**: {calendar.day_name[busiest_weekday]} (~{precise(self.week[busiest_weekday]/n_weekdays, precision=3)} msg, {percent(self.week[busiest_weekday]/total_msg)})", - f"- **busiest day ever**: {str_date(self.busiest_day)} ({from_now(self.busiest_day)}, 
{self.busiest_day_count} msg)", + f"- **busiest day ever**: {str_date(self.busiest_day)} ({from_now(self.busiest_day)}, {self.busiest_day_count} msg)" + if self.busiest_day is not None + else "", f"- **messages/hour**: {precise(total_msg*3600/delta.total_seconds(), precision=3)}", f"- **busiest hour of day**: {busiest_hour:0>2}:00 (~{precise(self.day[busiest_hour]/n_hours, precision=3)} msg, {percent(self.day[busiest_hour]/total_msg)})", f"- **busiest hour ever**: {str_datetime(self.busiest_hour)} ({from_now(self.busiest_hour)}, {self.busiest_hour_count} msg)", f"- **longest break**: {plural(round(self.longest_break.total_seconds()/3600), 'hour')} ({plural(self.longest_break.days,'day')}) from {str_datetime(self.longest_break_start)} ({from_now(self.longest_break_start)})", f"- **avg. streak**: {precise(sum(self.streaks)/len(self.streaks), precision=3)} msg", + f"- **longest streak**: {self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)} ({from_now(self.longest_streak_start)})" + if member_specific + else f"- **longest streak**: {mention(self.longest_streak_author)} ({self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)}, {from_now(self.longest_streak_start)})", ] - if member_specific: - ret += [ - f"- **longest streak**: {self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)} ({from_now(self.longest_streak_start)})" - ] - else: - ret += [ - f"- **longest streak**: {mention(self.longest_streak_author)} ({self.longest_streak:,} msg from {str_datetime(self.longest_streak_start)}, {from_now(self.longest_streak_start)})" - ] return ret diff --git a/src/data_types/presence.py b/src/data_types/presence.py index 778881a..682774b 100644 --- a/src/data_types/presence.py +++ b/src/data_types/presence.py @@ -25,74 +25,70 @@ class Presence: show_top_channel: bool, member_specific: bool, ) -> List[str]: - ret = [] if chan_count is None: type = "server's" elif chan_count == 1: type = "channel's" else: type = "channels'" 
- if member_specific: - ret += [ - f"- **messages**: {msg_count:,} ({percent(msg_count/total_msg)} of {type})" - ] - else: - top_member = top_key(self.messages) - ret += [ - f"- **top messages**: {mention(top_member)} ({self.messages[top_member]:,} msg, {percent(self.messages[top_member]/val_sum(self.messages))})" - ] - if show_top_channel: - top_channel = top_key(self.channel_usage) - channel_sum = val_sum(self.channel_usage) - found_in = sorted( - self.channel_usage, - key=lambda k: self.channel_usage[k] / self.channel_total[k], - )[-1] - ret += [ - f"- **most visited channel**: {channel_mention(top_channel)} ({self.channel_usage[top_channel]:,} msg, {percent(self.channel_usage[top_channel]/channel_sum)})", - ] - if member_specific: - ret += [ - f"- **most contributed channel**: {channel_mention(found_in)} ({self.channel_usage[found_in]:,} msg, {percent(self.channel_usage[found_in]/self.channel_total[found_in])} of {type})" - ] - if member_specific: - if len(self.mentions) > 0: - top_mention = top_key(self.mentions) - mention_sum = val_sum(self.mentions) - ret += [ - f"- **was mentioned**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})", - f"- **mostly mentioned by**: {mention(top_mention)} ({plural(self.mentions[top_mention], 'time')}, {percent(self.mentions[top_mention]/mention_sum)})", - ] - if len(self.mention_others) > 0: - top_mention = top_key(self.mention_others) - mention_sum = val_sum(self.mention_others) - if member_specific: - ret += [ - f"- **mentioned others**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})", - f"- **mostly mentioned**: {mention(top_mention)} ({plural(self.mention_others[top_mention], 'time')}, {percent(self.mention_others[top_mention]/mention_sum)})", - ] - else: - top_member = top_key(self.mention_count) - ret += [ - f"- **mentioned**: {plural(mention_sum, 'time')} ({mention(top_member)}, 
{percent(self.mention_count[top_member]/val_sum(self.mention_count))})", - f"- **top mentions**: {mention(top_member)} ({plural(self.mention_count[top_member], 'time')}, {percent(self.mention_count[top_member]/val_sum(self.mention_count))})", - f"- **most mentioned**: {mention(top_mention)} ({plural(self.mention_others[top_mention], 'time')}, {percent(self.mention_others[top_mention]/mention_sum)})", - ] - if len(self.reactions) > 0: - total_used = val_sum(self.reactions) - top_reaction = top_key(self.reactions) - ret += [ - f"- **reactions**: {plural(total_used, 'time')}", - f"- **most used reaction**: {top_reaction} ({plural(self.reactions[top_reaction], 'time')}, {percent(self.reactions[top_reaction]/total_used)})", - ] - if member_specific: - ret[ - -2 - ] += f" ({percent(total_used/val_sum(self.used_reaction))} of {type})" - else: - top_member = top_key(self.used_reaction) - ret.insert( - -1, - f"- **top reactions**: {mention(top_member)} ({plural(self.used_reaction[top_member], 'time')}, {percent(self.used_reaction[top_member]/val_sum(self.used_reaction))})", - ) + top_member = top_key(self.messages) + top_channel = top_key(self.channel_usage) + channel_sum = val_sum(self.channel_usage) + found_in = top_key( + self.channel_usage, + key=lambda k: self.channel_usage[k] / self.channel_total[k], + ) + top_mention = top_key(self.mentions) + mention_sum = val_sum(self.mentions) + top_mention_others = top_key(self.mention_others) + mention_others_sum = val_sum(self.mention_others) + top_member_mentioned = top_key(self.mention_count) + total_reaction_used = val_sum(self.reactions) + top_reaction = top_key(self.reactions) + top_reaction_member = top_key(self.used_reaction) + + ret = [ + f"- **messages**: {msg_count:,} ({percent(msg_count/total_msg)} of {type})" + if member_specific + else f"- **top messages**: {mention(top_member)} ({self.messages[top_member]:,} msg, {percent(self.messages[top_member]/val_sum(self.messages))})", + f"- **most visited channel**: 
{channel_mention(top_channel)} ({self.channel_usage[top_channel]:,} msg, {percent(self.channel_usage[top_channel]/channel_sum)})" + if show_top_channel + else "", + f"- **most contributed channel**: {channel_mention(found_in)} ({self.channel_usage[found_in]:,} msg, {percent(self.channel_usage[found_in]/self.channel_total[found_in])} of {type})" + if show_top_channel and member_specific + else "", + f"- **was mentioned**: {plural(mention_sum, 'time')} ({percent(mention_sum/val_sum(self.mention_count))} of {type})" + if member_specific and len(self.mentions) > 0 + else "", + f"- **mostly mentioned by**: {mention(top_mention)} ({plural(self.mentions[top_mention], 'time')}, {percent(self.mentions[top_mention]/mention_sum)})" + if member_specific and len(self.mentions) > 0 + else "", + f"- **mentioned others**: {plural(mention_others_sum, 'time')} ({percent(mention_others_sum/val_sum(self.mention_count))} of {type})" + if len(self.mention_others) > 0 and member_specific + else "", + f"- **mostly mentioned**: {mention(top_mention_others)} ({plural(self.mention_others[top_mention_others], 'time')}, {percent(self.mention_others[top_mention_others]/mention_others_sum)})" + if len(self.mention_others) > 0 and member_specific + else "", + f"- **mentioned**: {plural(mention_others_sum, 'time')} ({mention(top_member_mentioned)}, {percent(self.mention_count[top_member_mentioned]/val_sum(self.mention_count))})" + if len(self.mention_others) > 0 and not member_specific + else "", + f"- **top mentions**: {mention(top_member_mentioned)} ({plural(self.mention_count[top_member_mentioned], 'time')}, {percent(self.mention_count[top_member_mentioned]/val_sum(self.mention_count))})" + if len(self.mention_others) > 0 and not member_specific + else "", + f"- **most mentioned**: {mention(top_mention_others)} ({plural(self.mention_others[top_mention_others], 'time')}, {percent(self.mention_others[top_mention_others]/mention_others_sum)})" + if len(self.mention_others) > 0 and not 
member_specific + else "", + f"- **reactions**: {plural(total_reaction_used, 'time')}" + if len(self.reactions) > 0 and not member_specific + else "", + f"- **reactions**: {plural(total_reaction_used, 'time')} ({percent(total_reaction_used/val_sum(self.used_reaction))} of {type})" + if len(self.reactions) > 0 and member_specific + else "", + f"- **top reactions**: {mention(top_reaction_member)} ({plural(self.used_reaction[top_reaction_member], 'time')}, {percent(self.used_reaction[top_reaction_member]/val_sum(self.used_reaction))})" + if len(self.reactions) > 0 and not member_specific + else "", + f"- **most used reaction**: {top_reaction} ({plural(self.reactions[top_reaction], 'time')}, {percent(self.reactions[top_reaction]/total_reaction_used)})" + if len(self.reactions) > 0 + else "", + ] return ret diff --git a/src/logs/__init__.py b/src/logs/__init__.py index 358e9af..d62ab1d 100644 --- a/src/logs/__init__.py +++ b/src/logs/__init__.py @@ -1,3 +1,3 @@ from .message_log import MessageLog from .channel_logs import ChannelLogs -from .guild_logs import GuildLogs, ALREADY_RUNNING, CANCELLED +from .guild_logs import GuildLogs, ALREADY_RUNNING, CANCELLED, NO_FILE diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py index 86c7a28..a5e3857 100644 --- a/src/logs/channel_logs.py +++ b/src/logs/channel_logs.py @@ -1,5 +1,7 @@ from typing import Union, Tuple, Any import discord +from discord import message +from datetime import datetime from . 
import MessageLog from utils import FakeMessage @@ -7,6 +9,8 @@ from utils import FakeMessage CHUNK_SIZE = 2000 FORMAT = 3 +NOT_SERIALIZED = ["channel", "guild", "start_date"] + class ChannelLogs: def __init__(self, channel: Union[discord.TextChannel, dict], guild: Any): @@ -15,8 +19,10 @@ class ChannelLogs: self.id = channel.id self.name = channel.name self.last_message_id = None + self.first_message_id = None self.format = FORMAT self.messages = [] + self.start_date = None elif isinstance(channel, dict): self.format = channel["format"] if "format" in channel else None if not self.is_format(): @@ -28,63 +34,102 @@ class ChannelLogs: if channel["last_message_id"] is not None else None ) + self.first_message_id = ( + int(channel["first_message_id"]) + if "first_message_id" in channel + and channel["first_message_id"] is not None + else None + ) self.messages = [ MessageLog(message, self) for message in channel["messages"] ] + self.start_date = ( + self.messages[-1].created_at if len(self.messages) > 0 else None + ) def is_format(self): return self.format == FORMAT - async def load(self, channel: discord.TextChannel) -> Tuple[int, int]: + async def load( + self, channel: discord.TextChannel, start_date: datetime, stop_date: datetime + ) -> Tuple[int, int]: self.name = channel.name self.channel = channel + is_empty = self.last_message_id is None try: - if self.last_message_id is not None: # append + if is_empty: + sanity_check = len(await channel.history(limit=1).flatten()) + if sanity_check != 1: + yield len(self.messages), True + return + # load backward + if is_empty or ( + self.first_message_id is not None + and ( + start_date is None + or (self.start_date is not None and self.start_date > start_date) + ) + ): + first_message_date = None + tmp_message_id = 0 + done = 0 + while ( + first_message_date is None + or ( + done >= CHUNK_SIZE + and (start_date is None or first_message_date > start_date) + ) + ) and tmp_message_id != self.first_message_id: + tmp_message_id 
= self.first_message_id + done = 0 + async for message in channel.history( + limit=CHUNK_SIZE, + before=FakeMessage(self.first_message_id) + if self.first_message_id is not None + else None, + oldest_first=False, + ): + done += 1 + self.first_message_id = message.id + first_message_date = message.created_at + m = MessageLog(message, self) + await m.load(message) + self.messages += [m] + yield len(self.messages), False + if done < CHUNK_SIZE: # reached bottom + self.first_message_id = None + self.last_message_id = channel.last_message_id + # load forward + last_message_date = self.messages[0].created_at + if not is_empty and (stop_date is None or last_message_date < stop_date): tmp_message_id = None while ( self.last_message_id != channel.last_message_id - and self.last_message_id != tmp_message_id - ): + and (stop_date is None or last_message_date < stop_date) + ) and self.last_message_id != tmp_message_id: tmp_message_id = self.last_message_id async for message in channel.history( limit=CHUNK_SIZE, after=FakeMessage(self.last_message_id), oldest_first=True, ): + last_message_date = message.created_at self.last_message_id = message.id m = MessageLog(message, self) await m.load(message) self.messages.insert(0, m) yield len(self.messages), False - else: # first load - last_message_id = None - done = 0 - sanity_check = len(await channel.history(limit=1).flatten()) - if sanity_check == 1: - while done >= CHUNK_SIZE or last_message_id is None: - done = 0 - async for message in channel.history( - limit=CHUNK_SIZE, - before=FakeMessage(last_message_id) - if last_message_id is not None - else None, - oldest_first=False, - ): - done += 1 - last_message_id = message.id - m = MessageLog(message, self) - await m.load(message) - self.messages += [m] - yield len(self.messages), False - self.last_message_id = channel.last_message_id except discord.errors.HTTPException: yield -1, True return # When an exception occurs (like Forbidden) + self.start_date = ( + 
self.messages[-1].created_at if len(self.messages) > 0 else None + ) yield len(self.messages), True def dict(self) -> dict: channel = dict(self.__dict__) - channel.pop("channel", None) - channel.pop("guild", None) + for key in NOT_SERIALIZED: + channel.pop(key, None) channel["messages"] = [message.dict() for message in self.messages] return channel diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py index 88d8823..7600077 100644 --- a/src/logs/guild_logs.py +++ b/src/logs/guild_logs.py @@ -15,6 +15,7 @@ from utils import code_message, delta, deltas LOG_DIR = "logs" +LOG_EXT = ".logz" current_analysis = [] current_analysis_lock = threading.Lock() @@ -22,12 +23,22 @@ current_analysis_lock = threading.Lock() ALREADY_RUNNING = -100 CANCELLED = -200 +NO_FILE = -300 +# 5 minutes, assume 'fast' arg MIN_MODIFICATION_TIME = 5 * 60 +# ~1 year, remove log file +MAX_MODIFICATION_TIME = 365 * 24 * 60 * 60 class Worker: - def __init__(self, channel_log: ChannelLogs, channel: discord.TextChannel): + def __init__( + self, + channel_log: ChannelLogs, + channel: discord.TextChannel, + start_date: datetime, + stop_date: datetime, + ): self.channel_log = channel_log self.channel = channel self.start_msg = len(channel_log.messages) @@ -36,12 +47,16 @@ class Worker: self.done = False self.cancelled = False self.loop = asyncio.get_event_loop() + self.start_date = start_date + self.stop_date = stop_date def start(self): asyncio.run_coroutine_threadsafe(self.process(), self.loop) async def process(self): - async for count, done in self.channel_log.load(self.channel): + async for count, done in self.channel_log.load( + self.channel, self.start_date, self.stop_date + ): if count > 0: self.queried_msg = count - self.start_msg self.total_msg = count @@ -54,7 +69,7 @@ class GuildLogs: def __init__(self, guild: discord.Guild): self.id = guild.id self.guild = guild - self.log_file = os.path.join(LOG_DIR, f"{guild.id}.logz") + self.log_file = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") 
self.channels = {} self.locked = False @@ -74,26 +89,29 @@ class GuildLogs: return self.locked and self.log_file not in current_analysis def lock(self) -> bool: - self.locked = True current_analysis_lock.acquire() if self.log_file in current_analysis: current_analysis_lock.release() return False + self.locked = True current_analysis.append(self.log_file) current_analysis_lock.release() return True def unlock(self): - self.locked = False - current_analysis_lock.acquire() - if self.log_file in current_analysis: - current_analysis.remove(self.log_file) - current_analysis_lock.release() + if self.locked: + self.locked = False + current_analysis_lock.acquire() + if self.log_file in current_analysis: + current_analysis.remove(self.log_file) + current_analysis_lock.release() async def load( self, progress: discord.Message, - target_channels: List[discord.TextChannel] = [], + target_channels: List[discord.TextChannel], + start_date: datetime, + stop_date: datetime, *, fast: bool, fresh: bool, @@ -106,52 +124,49 @@ class GuildLogs: if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) last_time = None - if os.path.exists(self.log_file): - channels = {} - try: - last_time = os.path.getmtime(self.log_file) - gziped_data = None - await code_message(progress, "Reading saved history (1/4)...") - t0 = datetime.now() - with open(self.log_file, mode="rb") as f: - gziped_data = f.read() - logging.info(f"log {self.guild.id} > read in {delta(t0):,}ms") - if self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved history (2/4)...") - t0 = datetime.now() - json_data = gzip.decompress(gziped_data) - del gziped_data - logging.info( - f"log {self.guild.id} > gzip decompress in {delta(t0):,}ms" - ) - if self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved history (3/4)...") - t0 = datetime.now() - channels = json.loads(json_data) - del json_data - logging.info(f"log {self.guild.id} > json parse in {delta(t0):,}ms") - if 
self.check_cancelled(): - return CANCELLED, 0 - await code_message(progress, "Reading saved history (4/4)...") - t0 = datetime.now() - self.channels = { - int(id): ChannelLogs(channels[id], self) for id in channels - } - # remove invalid format - self.channels = { - id: self.channels[id] - for id in self.channels - if self.channels[id].is_format() - } - logging.info(f"log {self.guild.id} > loaded in {delta(t0):,}ms") - except json.decoder.JSONDecodeError: - logging.error(f"log {self.guild.id} > invalid JSON") - except IOError: - logging.error(f"log {self.guild.id} > cannot read") - else: - fast = False + if not os.path.exists(self.log_file): + return NO_FILE, 0 + channels = {} + try: + last_time = os.path.getmtime(self.log_file) + gziped_data = None + await code_message(progress, "Reading saved history (1/4)...") + t0 = datetime.now() + with open(self.log_file, mode="rb") as f: + gziped_data = f.read() + logging.info(f"log {self.guild.id} > read in {delta(t0):,}ms") + if self.check_cancelled(): + return CANCELLED, 0 + await code_message(progress, "Reading saved history (2/4)...") + t0 = datetime.now() + json_data = gzip.decompress(gziped_data) + del gziped_data + logging.info(f"log {self.guild.id} > gzip decompress in {delta(t0):,}ms") + if self.check_cancelled(): + return CANCELLED, 0 + await code_message(progress, "Reading saved history (3/4)...") + t0 = datetime.now() + channels = json.loads(json_data) + del json_data + logging.info(f"log {self.guild.id} > json parse in {delta(t0):,}ms") + if self.check_cancelled(): + return CANCELLED, 0 + await code_message(progress, "Reading saved history (4/4)...") + t0 = datetime.now() + self.channels = { + int(id): ChannelLogs(channels[id], self) for id in channels + } + # remove invalid format + self.channels = { + id: self.channels[id] + for id in self.channels + if self.channels[id].is_format() + } + logging.info(f"log {self.guild.id} > loaded in {delta(t0):,}ms") + except json.decoder.JSONDecodeError: + 
logging.error(f"log {self.guild.id} > invalid JSON") + except IOError: + logging.error(f"log {self.guild.id} > cannot read") if len(target_channels) == 0: target_channels = ( @@ -171,6 +186,8 @@ class GuildLogs: if ( not fast and not fresh + and start_date is None + and stop_date is None and last_time is not None and (time.time() - last_time) < MIN_MODIFICATION_TIME ): @@ -178,8 +195,10 @@ class GuildLogs: channel for channel in target_channels if channel.id not in self.channels + or self.channels[channel.id].first_message_id is not None ] if len(invalid_target_channels) == 0: + logging.info(f"log {self.guild.id} > assumed fast") fast = True if self.locked: self.unlock() @@ -212,7 +231,9 @@ class GuildLogs: if channel.id not in self.channels or fresh: loading_new += 1 self.channels[channel.id] = ChannelLogs(channel, self) - workers += [Worker(self.channels[channel.id], channel)] + workers += [ + Worker(self.channels[channel.id], channel, start_date, stop_date) + ] warning_msg = "(this might take a while)" if len(target_channels) > 5 and loading_new > 5: warning_msg = "(most channels are new, this will take a long while)" @@ -253,7 +274,7 @@ class GuildLogs: f"Reading new history...\n{total_msg:,} messages in {total_chan:,}/{max_chan:,} channels ({round(queried_msg/deltas(t0)):,}m/s)\n{warning_msg}{remaining_msg}", ) logging.info( - f"log {self.guild.id} > queried in {delta(t0):,}ms -> {queried_msg / deltas(t0):,.3f} m/s" + f"log {self.guild.id} > queried {queried_msg} in {delta(t0):,}ms -> {queried_msg / deltas(t0):,.3f} m/s" ) # write logs real_total_msg = sum( @@ -322,3 +343,46 @@ class GuildLogs: f"No cancellable analysis are currently running on this server", reference=message, ) + + @staticmethod + def init_log(guild: List[discord.Guild]): + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + filename = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") + if not os.path.exists(filename): + with open(filename, mode="wb") as f: + f.write(gzip.compress(bytes("{}", 
"utf-8"))) + logging.info(f"log {guild.id} > created") + else: + logging.info(f"log {guild.id} > already exists") + + @staticmethod + def remove_log(guild: List[discord.Guild]): + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + filename = os.path.join(LOG_DIR, f"{guild.id}{LOG_EXT}") + if os.path.exists(filename): + os.unlink(filename) + logging.info(f"log {guild.id} > removed") + else: + logging.info(f"log {guild.id} > does not exists") + + @staticmethod + def check_logs(guilds: List[discord.Guild]): + logging.info(f"checking logs...") + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + guild_ids = [str(guild.id) for guild in guilds] + for item in os.listdir(LOG_DIR): + path = os.path.join(LOG_DIR, item) + name, ext = os.path.splitext(item) + if os.path.isfile(path) and ext == LOG_EXT: + if ( + name in guild_ids + and (time.time() - os.path.getmtime(path)) > MAX_MODIFICATION_TIME + ): + logging.info(f"> removing old log '{path}'") + os.unlink(path) + elif name not in guild_ids: + logging.info(f"> removing unused log '{path}'") + os.unlink(path) diff --git a/src/logs/message_log.py b/src/logs/message_log.py index f534155..263c245 100644 --- a/src/logs/message_log.py +++ b/src/logs/message_log.py @@ -8,6 +8,9 @@ IMAGE_FORMAT = [".gif", ".gifv", ".png", ".jpg", ".jpeg", ".bmp"] EMBED_IMAGES = ["image", "gifv"] +NOT_SERIALIZED = ["channel"] + + class MessageLog: def __init__(self, message: Union[discord.Message, dict], channel: Any): self.channel = channel @@ -79,7 +82,8 @@ class MessageLog: def dict(self) -> dict: message = dict(self.__dict__) - message.pop("channel", None) + for key in NOT_SERIALIZED: + message.pop(key, None) message["created_at"] = self.created_at.isoformat() message["edited_at"] = ( self.edited_at.isoformat() if self.edited_at is not None else None diff --git a/src/main.py b/src/main.py index 565e92d..d2278e5 100644 --- a/src/main.py +++ b/src/main.py @@ -6,7 +6,7 @@ if sys.version_info < (3, 7): print("Please upgrade your Python version 
to 3.7.0 or higher") sys.exit(1) -from utils import emojis +from utils import emojis, gdpr from scanners import ( EmotesScanner, FullScanner, @@ -33,17 +33,43 @@ emojis.load_emojis() bot = Bot( "Discord Analyst", - "1.12", + "1.13", alias="%", ) bot.log_calls = True + +async def on_ready(): + GuildLogs.check_logs(bot.client.guilds) + return True + + +async def on_guild_remove(): + GuildLogs.check_logs(bot.client.guilds) + return True + + +bot.register_event(on_ready) +bot.register_event(on_guild_remove) + bot.register_command( "(cancel|stop)", GuildLogs.cancel, "cancel: stop current analysis (not launched with fast)", - "```\n" + "%cancel: Stop current analysis (not launched with fast)\n" + "```", + "```\n%cancel: Stop current analysis (not launched with fast)\n```", +) +bot.register_command( + "gdpr", + gdpr.process, + "gdpr: displays GDPR information", + gdpr.HELP, +) +bot.register_command( + "words", + lambda *args: WordsScanner().compute(*args), + "words: (BETA) rank words by their usage", + WordsScanner.help(), ) bot.register_command( "last", @@ -63,12 +89,6 @@ bot.register_command( "first: read first message", FirstScanner.help(), ) -bot.register_command( - "words", - lambda *args: WordsScanner().compute(*args), - "words: rank words by their usage", - WordsScanner.help(), -) bot.register_command( "mentioned", lambda *args: MentionedScanner().compute(*args), diff --git a/src/scanners/__init__.py b/src/scanners/__init__.py index 21fd922..ed9141d 100644 --- a/src/scanners/__init__.py +++ b/src/scanners/__init__.py @@ -11,4 +11,4 @@ from .reactions_scanner import ReactionsScanner from .first_scanner import FirstScanner from .last_scanner import LastScanner from .random_scanner import RandomScanner -from .words_scanner import WordsScanner \ No newline at end of file +from .words_scanner import WordsScanner diff --git a/src/scanners/channels_scanner.py b/src/scanners/channels_scanner.py index b3e7763..c766fb4 100644 --- a/src/scanners/channels_scanner.py +++ 
b/src/scanners/channels_scanner.py @@ -8,21 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import generate_help, mention, channel_mention class ChannelsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%chan: Rank channels by their messages\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 10\n" - + "* all/everyone - include bots\n" - + "Example: %chan 10 @user\n" - + "```" + return generate_help( + "chan", + "Rank channels by their messages", + args=[" - top , default is 10", "all/everyone - include bots"], + example="5 @user", ) def __init__(self): diff --git a/src/scanners/composition_scanner.py b/src/scanners/composition_scanner.py index daec2f7..a2f3822 100644 --- a/src/scanners/composition_scanner.py +++ b/src/scanners/composition_scanner.py @@ -8,21 +8,13 @@ import discord from .scanner import Scanner from data_types import Composition from logs import ChannelLogs, MessageLog -from utils import emojis, COMMON_HELP_ARGS +from utils import emojis, generate_help class CompositionScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%compo: Show composition statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %compo #mychannel1 @user\n" - + "```" - ) + return generate_help("compo", "Show composition statistics") def __init__(self): super().__init__( diff --git a/src/scanners/emotes_scanner.py b/src/scanners/emotes_scanner.py index 8c7b93f..b126812 100644 --- a/src/scanners/emotes_scanner.py +++ b/src/scanners/emotes_scanner.py @@ -8,24 +8,23 @@ import discord from logs import ChannelLogs, MessageLog from data_types import Emote, get_emote_dict from .scanner import Scanner -from utils import emojis, COMMON_HELP_ARGS, plural, precise +from utils import emojis, generate_help, plural, precise 
class EmotesScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%emojis: Rank emojis by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top emojis, default is 20\n" - + "* all - list all common emojis in addition to this guild's\n" - + "* members - show top member for each emojis\n" - + "* sort:usage/reaction - other sorting methods\n" - + "* everyone - include bots\n" - + "Example: %emojis 10 all #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "emojis", + "Rank emojis by their usage", + args=[ + " - top emojis, default is 20", + "all - list all common emojis in addition to this guild's", + "members - show top member for each emojis", + "sort:usage/reaction - other sorting methods", + "everyone - include bots", + ], + example="10 all #mychannel1 #mychannel2 @user", ) def __init__(self): diff --git a/src/scanners/first_scanner.py b/src/scanners/first_scanner.py index 766b145..1048e2c 100644 --- a/src/scanners/first_scanner.py +++ b/src/scanners/first_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class FirstScanner(HistoryScanner): @staticmethod def help() -> str: - return super(FirstScanner, FirstScanner).help( - cmd="first", text="Read first message" - ) + return generate_help("first", "Read first message") def __init__(self): super().__init__(help=FirstScanner.help()) diff --git a/src/scanners/frequency_scanner.py b/src/scanners/frequency_scanner.py index fac0a27..e0a4818 100644 --- a/src/scanners/frequency_scanner.py +++ b/src/scanners/frequency_scanner.py @@ -8,21 +8,13 @@ import discord from .scanner import Scanner from data_types import Frequency from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class FrequencyScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%freq: Show frequency-related statistics\n" - + 
"arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %freq #mychannel1 @user\n" - + "```" - ) + return generate_help("freq", "Show frequency-related statistics") def __init__(self): super().__init__( @@ -55,7 +47,7 @@ class FrequencyScanner(Scanner): freq: Frequency, raw_members: List[int], *, - all_messages: bool + all_messages: bool, ) -> bool: impacted = False # If author is included in the selection (empty list is all) diff --git a/src/scanners/full_scanner.py b/src/scanners/full_scanner.py index 22149bd..ac5cb0e 100644 --- a/src/scanners/full_scanner.py +++ b/src/scanners/full_scanner.py @@ -8,21 +8,13 @@ from .scanner import Scanner from . import FrequencyScanner, CompositionScanner, PresenceScanner from data_types import Frequency, Composition, Presence from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class FullScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%scan: Show full statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %scan #mychannel1 @user\n" - + "```" - ) + return generate_help("scan", "Show full statistics") def __init__(self): super().__init__( diff --git a/src/scanners/history_scanner.py b/src/scanners/history_scanner.py index c61872e..5a3ae4c 100644 --- a/src/scanners/history_scanner.py +++ b/src/scanners/history_scanner.py @@ -7,22 +7,9 @@ import discord from .scanner import Scanner from data_types import History from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS class HistoryScanner(Scanner, ABC): - @staticmethod - def help(*, cmd: str, text: str) -> str: - return ( - "```\n" - + f"%{cmd}: {text}\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %{cmd} #mychannel1 @user\n" - + "```" - ) - def __init__(self, *, help: str): super().__init__( has_digit_args=True, diff --git 
a/src/scanners/last_scanner.py b/src/scanners/last_scanner.py index 7713195..3d8cbf0 100644 --- a/src/scanners/last_scanner.py +++ b/src/scanners/last_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class LastScanner(HistoryScanner): @staticmethod def help() -> str: - return super(LastScanner, LastScanner).help( - cmd="last", text="Read last message" - ) + return generate_help("last", "Read last message") def __init__(self): super().__init__(help=LastScanner.help()) diff --git a/src/scanners/mentioned_scanner.py b/src/scanners/mentioned_scanner.py index 8cf74d4..fa6c09e 100644 --- a/src/scanners/mentioned_scanner.py +++ b/src/scanners/mentioned_scanner.py @@ -8,22 +8,18 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, plural, precise, mention, alt_mention +from utils import generate_help, plural, precise, mention, alt_mention class MentionedScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%mentioned: Rank specific user's mentions by their usage\n" - + "arguments:\n" - + "* @member/me - (required) one or more member\n" - + "\n".join(COMMON_HELP_ARGS.split("\n")[1:]) - + "* - top mentions, default is 10\n" - + "* all - include bots mentions\n" - + "Example: %mentioned 10 @user\n" - + "```" + return generate_help( + "mentioned", + "Rank specific user's mentions by their usage", + args=[" - top , default is 10", "all/everyone - include bots"], + example="5 @user", + replace_args=[" @member/me - (required) one or more member"], ) def __init__(self): @@ -45,7 +41,7 @@ class MentionedScanner(Scanner): "You need to mention at least one member or use `me`", reference=message ) return False - self.all_mentions = "all" in args + self.all_mentions = "all" in args or "everyone" in args # Create mentions dict self.mentions = defaultdict(Counter) return 
True diff --git a/src/scanners/mentions_scanner.py b/src/scanners/mentions_scanner.py index 8890390..50a0f5c 100644 --- a/src/scanners/mentions_scanner.py +++ b/src/scanners/mentions_scanner.py @@ -9,7 +9,7 @@ from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter from utils import ( - COMMON_HELP_ARGS, + generate_help, plural, precise, mention, @@ -22,16 +22,15 @@ from utils import ( class MentionsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%mentions: Rank mentions by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top mentions, default is 10\n" - + "* all - show role/channel/everyone/here mentions\n" - + "* everyone - include bots mentions\n" - + "Example: %mentions 10 #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "mentions", + "Rank mentions by their usage", + args=[ + " - top , default is 10", + "all - show role/channel/everyone/here mentions", + "everyone - include bots mentions", + ], + example="10 #mychannel1 #mychannel2 @user", ) def __init__(self): diff --git a/src/scanners/messages_scanner.py b/src/scanners/messages_scanner.py index f576057..a79735e 100644 --- a/src/scanners/messages_scanner.py +++ b/src/scanners/messages_scanner.py @@ -8,21 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import generate_help, mention, channel_mention class MessagesScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%msg: Rank users by their messages\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 10\n" - + "* all/everyone - include bots\n" - + "Example: %msg 10 #channel\n" - + "```" + return generate_help( + "msg", + "Rank users by their messages", + args=[" - top , default is 10", "all/everyone - include bots"], + example="10 #channel", ) def __init__(self): 
diff --git a/src/scanners/presence_scanner.py b/src/scanners/presence_scanner.py index b19e723..5e39931 100644 --- a/src/scanners/presence_scanner.py +++ b/src/scanners/presence_scanner.py @@ -7,21 +7,13 @@ import discord from .scanner import Scanner from data_types import Presence from logs import ChannelLogs, MessageLog -from utils import COMMON_HELP_ARGS +from utils import generate_help class PresenceScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%pres: Show presence statistics\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* all/everyone - include bots\n" - + "Example: %pres #mychannel1 @user\n" - + "```" - ) + return generate_help("pres", "Show presence statistics") def __init__(self): super().__init__( diff --git a/src/scanners/random_scanner.py b/src/scanners/random_scanner.py index 9ef520b..f4fb7a9 100644 --- a/src/scanners/random_scanner.py +++ b/src/scanners/random_scanner.py @@ -3,14 +3,13 @@ from typing import List # Custom libs from .history_scanner import HistoryScanner +from utils import generate_help class RandomScanner(HistoryScanner): @staticmethod def help() -> str: - return super(RandomScanner, RandomScanner).help( - cmd="rand", text="Read a random message" - ) + return generate_help("rand", "Read a random message") def __init__(self): super().__init__(help=RandomScanner.help()) diff --git a/src/scanners/reactions_scanner.py b/src/scanners/reactions_scanner.py index bb84387..3603a06 100644 --- a/src/scanners/reactions_scanner.py +++ b/src/scanners/reactions_scanner.py @@ -8,20 +8,17 @@ import discord from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter -from utils import COMMON_HELP_ARGS, mention, channel_mention +from utils import generate_help, mention, channel_mention class ReactionsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%react: Rank users by their reactions\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - top , default is 
10\n" - + "Example: %react 10 #channel\n" - + "```" + return generate_help( + "react", + "Rank users by their reactions", + args=[" - top , default is 10"], + example="10 #channel", ) def __init__(self): diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 9a7e712..b4a96f1 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -5,8 +5,24 @@ import logging import re import discord -from utils import no_duplicate, get_intro, delta -from logs import GuildLogs, ChannelLogs, MessageLog, ALREADY_RUNNING, CANCELLED + +from utils import ( + no_duplicate, + get_intro, + delta, + gdpr, + ISO8601_REGEX, + RELATIVE_REGEX, + parse_time, +) +from logs import ( + GuildLogs, + ChannelLogs, + MessageLog, + ALREADY_RUNNING, + CANCELLED, + NO_FILE, +) class Scanner(ABC): @@ -47,22 +63,42 @@ class Scanner(ABC): str(channel.id) for channel in message.channel_mentions ] str_mentions = [str(member.id) for member in message.mentions] + dates = [] for i, arg in enumerate(args[1:]): + skip_check = False if re.match(r"^<@!?\d+>$", arg): arg = arg[3:-1] if "!" in arg else arg[2:-1] elif re.match(r"^<#!?\d+>$", arg): arg = arg[3:-1] if "!" 
in arg else arg[2:-1] + elif re.match(ISO8601_REGEX, arg) or re.match(RELATIVE_REGEX, arg): + dates += [parse_time(arg)] + skip_check = True + if len(dates) > 2: + await message.channel.send( + f"Too many date arguments: `{arg}`", reference=message + ) + return if ( arg not in self.valid_args + ["me", "here", "fast", "fresh"] and (not arg.isdigit() or not self.has_digit_args) and arg not in str_channel_mentions and arg not in str_mentions + and not skip_check ): await message.channel.send( f"Unrecognized argument: `{arg}`", reference=message ) return + self.start_date = None if len(dates) < 1 else min(dates) + self.stop_date = None if len(dates) < 2 else max(dates) + + if self.start_date is not None and self.start_date > datetime.now(): + await message.channel.send( + f"Start date is after today", reference=message + ) + return + # Get selected channels or all of them if no channel arguments self.channels = no_duplicate(message.channel_mentions) @@ -94,7 +130,12 @@ class Scanner(ABC): allowed_mentions=discord.AllowedMentions.none(), ) total_msg, total_chan = await logs.load( - progress, self.channels, fast="fast" in args, fresh="fresh" in args + progress, + self.channels, + self.start_date, + self.stop_date, + fast="fast" in args, + fresh="fresh" in args, ) if total_msg == CANCELLED: await message.channel.send( @@ -106,7 +147,24 @@ class Scanner(ABC): "An analysis is already running on this server, please be patient.", reference=message, ) + elif total_msg == NO_FILE: + await message.channel.send(gdpr.TEXT) else: + if self.start_date is not None and len(logs.channels) > 0: + self.start_date = max( + self.start_date, + min( + [ + logs.channels[channel.id].start_date + for channel in self.channels + if channel.id in logs.channels + and logs.channels[channel.id].start_date is not None + ] + ), + ) + if self.stop_date is None: + self.stop_date = datetime.utcnow() + self.msg_count = 0 self.total_msg = 0 self.chan_count = 0 @@ -118,13 +176,21 @@ class Scanner(ABC): [ 
self.compute_message(channel_logs, message_log) for message_log in channel_logs.messages + if ( + self.start_date is None + or message_log.created_at >= self.start_date + ) + and ( + self.stop_date is None + or message_log.created_at <= self.stop_date + ) ] ) self.total_msg += len(channel_logs.messages) self.msg_count += count self.chan_count += 1 if count > 0 else 0 logging.info(f"scan {guild.id} > scanned in {delta(t0):,}ms") - if self.total_msg == 0: + if self.msg_count == 0: await message.channel.send( "There are no messages found matching the filters", reference=message, @@ -141,21 +207,24 @@ class Scanner(ABC): self.members, self.msg_count, self.chan_count, + self.start_date, + self.stop_date, ) ) logging.info(f"scan {guild.id} > results in {delta(t0):,}ms") response = "" first = True for r in results: - if len(response + "\n" + r) > 2000: - await message.channel.send( - response, - reference=message if first else None, - allowed_mentions=discord.AllowedMentions.none(), - ) - first = False - response = "" - response += "\n" + r + if r: + if len(response + "\n" + r) > 2000: + await message.channel.send( + response, + reference=message if first else None, + allowed_mentions=discord.AllowedMentions.none(), + ) + first = False + response = "" + response += "\n" + r if len(response) > 0: await message.channel.send( response, diff --git a/src/scanners/words_scanner.py b/src/scanners/words_scanner.py index 80d0971..f7f6dd7 100644 --- a/src/scanners/words_scanner.py +++ b/src/scanners/words_scanner.py @@ -9,7 +9,7 @@ from logs import ChannelLogs, MessageLog from .scanner import Scanner from data_types import Counter from utils import ( - COMMON_HELP_ARGS, + generate_help, plural, precise, ) @@ -18,16 +18,15 @@ from utils import ( class WordsScanner(Scanner): @staticmethod def help() -> str: - return ( - "```\n" - + "%words: Rank words by their usage\n" - + "arguments:\n" - + COMMON_HELP_ARGS - + "* - words containings or more letters, default is 3\n" - + "* - top 
words, default is 10\n" - + "* everyone - include bots\n" - + "Example: %words 5 10 #mychannel1 #mychannel2 @user\n" - + "```" + return generate_help( + "words", + "(BETA) Rank words by their usage", + args=[ + " - words containings or more letters, default is 3", + " - top words, default is 10", + "all/everyone - include bots", + ], + example="5 10 #mychannel1 #mychannel2 @user", ) def __init__(self): @@ -104,16 +103,13 @@ class WordsScanner(Scanner): or message.author in raw_members ): impacted = True - content = " ".join( - [ - block - for block in message.content.split() - if not re.match(r"^\w+:\/\/", block) - ] - ) + content = message.content + content = re.sub(r"```.+```", "", content, flags=re.DOTALL) + content = re.sub(r"`.+`", "", content, flags=re.DOTALL) + content = re.sub(r"\w+:\/\/[^ ]+", "", content) for word in re.split("[^\w\-':]", content): m = re.match( - r"(?!^:\w+:$)^[^\w]*((?![\d_])\w.*(?![\d_])\w)[^\w]*$", word + r"(?!^:\w+:$)^[^\w]*((?![\d_])\w[\w\-']*(?![\d_])\w)[^\w]*$", word ) if m: word = m[1].lower() @@ -126,7 +122,5 @@ class WordsScanner(Scanner): words[word] = words[word + case] del words[word + case] break - words[word].update_use( - message.content.count(word), message.created_at - ) + words[word].update_use(1, message.created_at) return impacted diff --git a/src/utils/gdpr.py b/src/utils/gdpr.py new file mode 100644 index 0000000..5ae85c6 --- /dev/null +++ b/src/utils/gdpr.py @@ -0,0 +1,65 @@ +import discord + +from logs import GuildLogs + + +HELP = """``` +%gdpr: Displays GDPR information +arguments: +* agree - agree to GDPR +* revoke - remove this server's data +```""" + +TEXT = """ +__**About Analyst-bot's data usage**__ +**TL;DR** +Analyst-bot collects text message information. It does not share collected data with any third-party and data is retained 18 months or until the bot is leaving the guild/server. +**Data collection** +Analyst-bot collects a Discord guild/server's history when asked to. 
+This includes: +- Visible text channel names +- Visible text messages: date and time of creation and edition, author, content, reactions and other available metadata (pinned, tts, etc.) +This does __not__ includes: +- Voice channels and not visible channels +- Not visible text messages +- Visible text messages' embedded content, images and other attachments +**Data processing** +Any data collected is only processed in order to produce a one-time report sent to the user immediately. No temporary data are retained. +**Data storage and retain policy** +Analyst-bot stores the collected data in files that are accessible by the software and its administrator only. +Any collected data are retained maximum 18 months until deletion or when the bot is leaving a guild/server. +**Data sharing** +Analyst-bot does not share the data collected with any third-party. +**Right to retract** +If you want to have your data removed, you can use the `%gdpr revoke` command or remove this bot from your guild/server. +**Terms agreement** +By agreeing to these terms, you ensure having the legal age if you are in a country that does have one and you also ensure having the consent of every member involved. + +*If you want more information, please contact the creator of this bot: .* + +Type `%gdpr agree` to agree to these terms, `%gdpr revoke` to remove this guild/server's collected data or `%gdpr` to see this message again. +""" + +AGREE_TEXT = "Thanks for agreeing for these terms, you can now run analysis on this guild/server." + +REVOKE_TEXT = "This guild/server's data has been deleted. To run new analysis you must agree to the terms again." 
async def process(client: discord.Client, message: discord.Message, *args: str):
    """Handle the ``%gdpr`` command.

    Without a sub-command the GDPR notice (``TEXT``) is sent. ``help`` sends
    ``HELP``; ``agree``/``accept`` creates the guild log through
    ``GuildLogs.init_log``; ``revoke``/``cancel``/``remove``/``delete``
    removes it through ``GuildLogs.remove_log``. Anything else is reported
    back to the channel as an unrecognized argument.

    :param client: the running Discord client (unused here)
    :param message: the message that triggered the command; replies are sent
        to ``message.channel``
    :param args: command tokens; ``args[0]`` is presumably the command name
        itself and ``args[1]`` the optional sub-command -- TODO confirm
        against the dispatcher
    """
    # No sub-command (or, defensively, no token at all): show the notice
    # instead of failing on args[1] further down.
    if len(args) <= 1:
        await message.channel.send(TEXT)
        return
    if len(args) > 2:
        await message.channel.send("Too many arguments", reference=message)
        return

    action = args[1]
    if action == "help":
        await message.channel.send(HELP, reference=message)
    elif action in ["agree", "accept"]:
        GuildLogs.init_log(message.channel.guild)
        await message.channel.send(AGREE_TEXT, reference=message)
    elif action in ["revoke", "cancel", "remove", "delete"]:
        GuildLogs.remove_log(message.channel.guild)
        await message.channel.send(REVOKE_TEXT, reference=message)
    else:
        await message.channel.send(
            f"Unrecognized argument: `{action}`", reference=message
        )
# Pattern used by callers to recognise an ISO 8601 date/time argument.
ISO8601_REGEX = r"^([\+-]?\d{4}(?!\d{2}\b))((-?)((0[1-9]|1[0-2])(\3([12]\d|0[1-9]|3[01]))?|W([0-4]\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\d|[12]\d{2}|3([0-5]\d|6[1-6])))([T\s]((([01]\d|2[0-3])((:?)[0-5]\d)?|24\:?00)([\.,]\d+(?!:))?)?(\17[0-5]\d([\.,]\d+)?)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?)?)?$"
# Template whose tail pads a truncated date ("2021-11") to a full timestamp.
ISO8601_FULL = "0000-01-01T00:00:00"

# Truncated "YYYY[-MM[-DD[THH[:MM[:SS]]]]]" forms that can be padded.
_PARTIAL_ISO_REGEX = re.compile(
    r"^\d{4}(-\d{2}(-\d{2}(T\d{2}(:\d{2}(:\d{2})?)?)?)?)?$"
)


def parse_iso_datetime(str_date: str) -> datetime:
    """Parse an ISO 8601 date/time string into a naive datetime.

    Truncated forms ("2020", "2021-11", "2020-06-28T23") are padded with the
    tail of ``ISO8601_FULL`` and parsed exactly; any other form is handed to
    ``dateutil.parser.parse``. A timezone-aware result is converted to UTC
    and stripped of its tzinfo, because the callers compare the value with
    naive datetimes.

    :raises ValueError: if the padded string is not a valid date
    """
    if _PARTIAL_ISO_REGEX.match(str_date):
        padded = str_date + ISO8601_FULL[len(str_date) :]
        return datetime.fromisoformat(padded)
    parsed = dateutil.parser.parse(str_date)
    offset = parsed.utcoffset()
    if offset is not None:
        # Mixing aware and naive datetimes raises TypeError on comparison.
        parsed = (parsed - offset).replace(tzinfo=None)
    return parsed


# Anchored on both ends: it is used with re.match, and an unanchored pattern
# would accept any word starting with h/d/w/m/y ("here", "me", "members").
RELATIVE_REGEX = r"^(yesterday|today|\d*(h(ours?)?|d(ays?)?|w(eeks?)?|m(onths?)?|y(ears?)?))$"

# Splits "8days" into its optional count and its unit word.
_RELATIVE_PARTS_REGEX = re.compile(r"^(\d*)([A-Za-z]+)$")


def parse_relative_time(src: str) -> datetime:
    """Convert a relative expression into a naive UTC datetime.

    Accepts ``today``, ``yesterday`` or ``<n><unit>`` where the unit starts
    with h(ours), d(ays), w(eeks), m(onths) or y(ears); a missing count
    means 1. The result is the current UTC time minus that duration.

    :raises ValueError: if ``src`` is not a supported expression
    """
    if src in ("today", "yesterday"):
        # NOTE(review): datetime.today() is the current local time, not
        # midnight, so "today" resolves to (roughly) the current UTC instant.
        # Confirm whether start-of-day was intended.
        timezone_delta = datetime.utcnow() - datetime.now()
        moment = datetime.today() + timezone_delta
        if src == "yesterday":
            moment -= timedelta(days=1)
        return moment

    parts = _RELATIVE_PARTS_REGEX.match(src)
    if parts is None:
        raise ValueError(f"Unsupported relative time: {src!r}")
    value = int(parts[1]) if parts[1] else 1
    unit = parts[2][0].lower()
    if unit == "h":
        delta = timedelta(hours=value)
    elif unit == "d":
        delta = timedelta(days=value)
    elif unit == "w":
        delta = timedelta(weeks=value)
    elif unit == "m":
        # Calendar-aware arithmetic is needed for months and years.
        delta = relativedelta(months=value)
    elif unit == "y":
        delta = relativedelta(years=value)
    else:
        raise ValueError(f"Unsupported relative time: {src!r}")
    return datetime.utcnow() - delta


def parse_time(src: str) -> datetime:
    """Parse a date argument, relative ("8days") or ISO 8601 ("2021-06-28")."""
    if re.match(RELATIVE_REGEX, src):
        return parse_relative_time(src)
    return parse_iso_datetime(src)
len(members) == 0: # Full scan of the server if full: - return f"{subject} in this server ({nc} channels, {nmm:,} messages):" + return f"{subject} in this server ({nc} channels, {nmm:,} messages){time_text}:" elif len(channels) < 5: - return f"{aggregate([c.mention for c in channels])} {subject.lower()} in {nmm:,} messages:" + return f"{aggregate([c.mention for c in channels])} {subject.lower()} in {nmm:,} messages{time_text}:" else: - return ( - f"These {len(channels)} channels {subject.lower()} in {nmm:,} messages:" - ) + return f"These {len(channels)} channels {subject.lower()} in {nmm:,} messages{time_text}:" elif len(members) < 5: if full: - return f"{aggregate([m.mention for m in members])} {subject.lower()} in {nmm:,} messages:" + return f"{aggregate([m.mention for m in members])} {subject.lower()} in {nmm:,} messages{time_text}:" elif len(channels) < 5: return ( f"{aggregate([m.mention for m in members])} on {aggregate([c.mention for c in channels])} " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: return ( f"{aggregate([m.mention for m in members])} on these {len(channels)} channels " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: if full: - return ( - f"These {len(members)} members {subject.lower()} in {nmm:,} messages:" - ) + return f"These {len(members)} members {subject.lower()} in {nmm:,} messages{time_text}:" elif len(channels) < 5: return ( f"These {len(members)} members on {aggregate([c.mention for c in channels])} " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" ) else: return ( f"These {len(members)} members on these {len(channels)} channels " - f"{subject.lower()} in {nmm:,} messages:" + f"{subject.lower()} in {nmm:,} messages{time_text}:" )