diff --git a/.gitignore b/.gitignore index bb73726..c9e2809 100755 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,4 @@ __pycache__ .env error_* *.log -logs/ +/logs/ diff --git a/src/logs/__init__.py b/src/logs/__init__.py new file mode 100644 index 0000000..dbed38a --- /dev/null +++ b/src/logs/__init__.py @@ -0,0 +1,3 @@ +from .message_log import MessageLog +from .channel_logs import ChannelLogs +from .guild_logs import GuildLogs diff --git a/src/logs/channel_logs.py b/src/logs/channel_logs.py new file mode 100644 index 0000000..1e7548a --- /dev/null +++ b/src/logs/channel_logs.py @@ -0,0 +1,74 @@ +from typing import Union, Tuple +import discord + +from . import MessageLog +from utils import FakeMessage + +CHUNK_SIZE = 1000 +FORMAT = 3 + + +class ChannelLogs: + def __init__(self, channel: Union[discord.TextChannel, dict]): + if isinstance(channel, discord.TextChannel): + self.id = channel.id + self.name = channel.name + self.last_message_id = None + self.format = FORMAT + self.messages = [] + elif isinstance(channel, dict): + self.format = channel["format"] if "format" in channel else None + if not self.is_format(): + return + self.id = int(channel["id"]) + self.name = channel["name"] + self.last_message_id = int(channel["last_message_id"]) + self.messages = [MessageLog(message) for message in channel["messages"]] + + def is_format(self): + return self.format == FORMAT + + async def load(self, channel: discord.TextChannel) -> Tuple[int, int]: + self.name = channel.name + self.channel = channel + try: + if self.last_message_id is not None: # append + while self.last_message_id != channel.last_message_id: + async for message in channel.history( + limit=CHUNK_SIZE, + after=FakeMessage(self.last_message_id), + oldest_first=True, + ): + self.last_message_id = message.id + m = MessageLog(message) + await m.load(message) + self.messages.insert(0, m) + yield len(self.messages), False + else: # first load + last_message_id = None + done = 0 + while done >= CHUNK_SIZE or last_message_id is None: + done = 0 + async for message in channel.history( + limit=CHUNK_SIZE, + before=FakeMessage(last_message_id) + if last_message_id is not None + else None, + oldest_first=False, + ): + done += 1 + last_message_id = message.id + m = MessageLog(message) + await m.load(message) + self.messages += [m] + yield len(self.messages), False + self.last_message_id = channel.last_message_id + except discord.errors.HTTPException: + return # When an exception occurs (like Forbidden) + yield len(self.messages), True + + def dict(self) -> dict: + channel = dict(self.__dict__) + channel.pop("channel", None) + channel["messages"] = [message.dict() for message in self.messages] + return channel diff --git a/src/logs/guild_logs.py b/src/logs/guild_logs.py new file mode 100644 index 0000000..f630c79 --- /dev/null +++ b/src/logs/guild_logs.py @@ -0,0 +1,132 @@ +from typing import List, Tuple +import os +import discord +import json +import gzip +from datetime import datetime +import logging + + +from . import ChannelLogs +from utils import code_message + + +LOG_DIR = "logs" + +current_analysis = [] + + +class GuildLogs: + def __init__(self, guild: discord.Guild): + self.guild = guild + self.log_file = os.path.join(LOG_DIR, f"{guild.id}.logz") + self.channels = {} + + def dict(self) -> dict: + return {id: self.channels[id].dict() for id in self.channels} + + async def load( + self, progress: discord.Message, target_channels: List[discord.TextChannel] = [] + ) -> Tuple[int, int]: + global current_analysis + if self.log_file in current_analysis: + return -1, -1 + current_analysis += [self.log_file] + # read logs + t0 = datetime.now() + if not os.path.exists(LOG_DIR): + os.mkdir(LOG_DIR) + if os.path.exists(self.log_file): + channels = {} + try: + gziped_data = None + await code_message(progress, "Reading saved history (1/4)...") + with open(self.log_file, mode="rb") as f: + gziped_data = f.read() + await code_message(progress, "Reading saved history (2/4)...") + json_data = gzip.decompress(gziped_data) + await code_message(progress, "Reading saved history (3/4)...") + channels = json.loads(json_data) + await code_message(progress, "Reading saved history (4/4)...") + self.channels = {int(id): ChannelLogs(channels[id]) for id in channels} + # remove invalid format + self.channels = { + id: self.channels[id] + for id in self.channels + if self.channels[id].is_format() + } + dt = (datetime.now() - t0).total_seconds() + logging.info(f"log {self.guild.id} > loaded in {dt} s") + except json.decoder.JSONDecodeError: + logging.error(f"log {self.guild.id} > invalid JSON") + except IOError: + logging.error(f"log {self.guild.id} > cannot read") + # load channels + t0 = datetime.now() + if len(target_channels) == 0: + target_channels = self.guild.text_channels + loading_new = 0 + total_msg = 0 + queried_msg = 0 + total_chan = 0 + max_chan = len(target_channels) + await code_message( + progress, + f"Reading history...\n0 messages in 0/{max_chan} channels\n(this might take a while)", + ) + for channel in target_channels: + if channel.id not in self.channels: + loading_new += 1 + self.channels[channel.id] = ChannelLogs(channel) + start_msg = len(self.channels[channel.id].messages) + async for count, done in self.channels[channel.id].load(channel): + if count > 0: + tmp_queried_msg = queried_msg + count - start_msg + tmp_msg = total_msg + count + warning_msg = "(this might take a while)" + if len(target_channels) > 5 and loading_new > 5: + warning_msg = ( + "(most channels are new, this might take a looong while)" + ) + elif loading_new > 0: + warning_msg = ( + "(some channels are new, this might take a long while)" + ) + dt = (datetime.now() - t0).total_seconds() + await code_message( + progress, + f"Reading history...\n{tmp_msg:,} messages in {total_chan + 1}/{max_chan} channels ({round(tmp_queried_msg/dt)}m/s)\n{warning_msg}", + ) + if done: + total_chan += 1 + total_msg += len(self.channels[channel.id].messages) + queried_msg += count - start_msg + dt = (datetime.now() - t0).total_seconds() + logging.info( + f"log {self.guild.id} > queried in {dt} s -> {queried_msg / dt} m/s" + ) + # write logs + t0 = datetime.now() + await code_message( + progress, + f"Saving (1/3)...\n{total_msg:,} messages in {total_chan} channels", + ) + json_data = bytes(json.dumps(self.dict()), "utf-8") + await code_message( + progress, + f"Saving (2/3)...\n{total_msg:,} messages in {total_chan} channels", + ) + gziped_data = gzip.compress(json_data) + await code_message( + progress, + f"Saving (3/3)...\n{total_msg:,} messages in {total_chan} channels", + ) + with open(self.log_file, mode="wb") as f: + f.write(gziped_data) + dt = (datetime.now() - t0).total_seconds() + logging.info(f"log {self.guild.id} > written in {dt} s") + await code_message( + progress, f"Analysing...\n{total_msg:,} messages in {total_chan} channels" + ) + current_analysis.remove(self.log_file) + return total_msg, total_chan diff --git a/src/logs/message_log.py b/src/logs/message_log.py new file mode 100644 index 0000000..21e24c6 --- /dev/null +++ b/src/logs/message_log.py @@ -0,0 +1,75 @@ +from typing import Union +import discord +from datetime import datetime + +from utils import is_extension + +IMAGE_FORMAT = ["gif", "gifv", "png", "jpg", "jpeg", "bmp"] +EMBED_IMAGES = ["image", "gifv"] + + +class MessageLog: + def __init__(self, message: Union[discord.Message, dict]): + if isinstance(message, discord.Message): + self.id = message.id + self.created_at = message.created_at + self.edited_at = message.edited_at + self.author = message.author.id + self.pinned = message.pinned + self.mention_everyone = message.mention_everyone + self.tts = message.tts + self.reference = ( + message.reference.message_id if message.reference is not None else None + ) + self.bot = message.author.bot or message.author.system + self.content = message.content + self.mentions = message.raw_mentions + self.role_mentions = message.raw_role_mentions + self.channel_mentions = message.raw_channel_mentions + self.image = False + for attachment in message.attachments: + if is_extension(attachment.filename, IMAGE_FORMAT): + self.image = True + break + if not self.image: + for embed in message.embeds: + if embed.type in EMBED_IMAGES: + self.image = True + break + self.reactions = {} + elif isinstance(message, dict): + self.id = int(message["id"]) + self.created_at = datetime.fromisoformat(message["created_at"]) + self.edited_at = ( + datetime.fromisoformat(message["edited_at"]) + if message["edited_at"] is not None + else None + ) + self.author = int(message["author"]) + self.pinned = message["pinned"] + self.mention_everyone = message["mention_everyone"] + self.tts = message["tts"] + self.reference = ( + int(message["reference"]) if message["reference"] is not None else None + ) + self.bot = message["bot"] + self.content = message["content"] + self.mentions = [int(m) for m in message["mentions"]] + self.role_mentions = [int(m) for m in message["role_mentions"]] + self.channel_mentions = [int(m) for m in message["channel_mentions"]] + self.image = message["image"] + self.reactions = message["reactions"] + + async def load(self, message: discord.Message): + for reaction in message.reactions: + self.reactions[str(reaction.emoji)] = [] + async for user in reaction.users(): + self.reactions[str(reaction.emoji)] += [user.id] + + def dict(self) -> dict: + message = dict(self.__dict__) + message["created_at"] = self.created_at.isoformat() + message["edited_at"] = ( + self.edited_at.isoformat() if self.edited_at is not None else None + ) + return message diff --git a/src/scanners/emotes_scanner.py b/src/scanners/emotes_scanner.py index 633872a..670ea66 100644 --- a/src/scanners/emotes_scanner.py +++ b/src/scanners/emotes_scanner.py @@ -4,7 +4,7 @@ import discord # Custom libs -from utils.log_manager import ChannelLogs, MessageLog +from logs import ChannelLogs, MessageLog from data_types import Emote from .scanner import Scanner from utils import emojis diff --git a/src/scanners/frequency_scanner.py b/src/scanners/frequency_scanner.py index adcddf3..0cb1ee5 100644 --- a/src/scanners/frequency_scanner.py +++ b/src/scanners/frequency_scanner.py @@ -5,7 +5,7 @@ import discord # Custom libs from .scanner import Scanner -from utils.log_manager import ChannelLogs, MessageLog +from logs import ChannelLogs, MessageLog class FrequencyScanner(Scanner): diff --git a/src/scanners/scanner.py b/src/scanners/scanner.py index 792907c..31a4135 100644 --- a/src/scanners/scanner.py +++ b/src/scanners/scanner.py @@ -4,7 +4,7 @@ from collections import defaultdict import discord from utils import no_duplicate, get_intro -from utils.log_manager import GuildLogs, ChannelLogs, MessageLog +from logs import GuildLogs, ChannelLogs, MessageLog from data_types import Emote diff --git a/src/utils/emojis.py b/src/utils/emojis.py index 611b55e..2554a49 100644 --- a/src/utils/emojis.py +++ b/src/utils/emojis.py @@ -2,7 +2,7 @@ import re import json import logging -from .utils import get_resource_path +from . import get_resource_path EXTRA_EMOJI = { "thumbup": "1f44d", diff --git a/src/utils/log_manager.py b/src/utils/log_manager.py deleted file mode 100644 index c3c126b..0000000 --- a/src/utils/log_manager.py +++ /dev/null @@ -1,271 +0,0 @@ -from typing import Union, List, Tuple -import os -import discord -import json -import gzip -from datetime import datetime -import logging - -from .utils import code_message, is_extension - -LOG_DIR = "logs" - -if not os.path.exists(LOG_DIR): - os.mkdir(LOG_DIR) - - -CHUNK_SIZE = 1000 -FORMAT = 3 -IMAGE_FORMAT = ["gif", "gifv", "png", "jpg", "jpeg", "bmp"] -EMBED_IMAGES = ["image", "gifv"] - -current_analysis = [] - - -class FakeMessage: - def __init__(self, id: int): - self.id = id - - -class MessageLog: - def __init__(self, message: Union[discord.Message, dict]): - if isinstance(message, discord.Message): - self.id = message.id - self.created_at = message.created_at - self.edited_at = message.edited_at - self.author = message.author.id - self.pinned = message.pinned - self.mention_everyone = message.mention_everyone - self.tts = message.tts - self.reference = ( - message.reference.message_id if message.reference is not None else None - ) - self.bot = message.author.bot or message.author.system - self.content = message.content - self.mentions = message.raw_mentions - self.role_mentions = message.raw_role_mentions - self.channel_mentions = message.raw_channel_mentions - self.image = False - for attachment in message.attachments: - if is_extension(attachment.filename, IMAGE_FORMAT): - self.image = True - break - if not self.image: - for embed in message.embeds: - if embed.type in EMBED_IMAGES: - self.image = True - break - self.reactions = {} - elif isinstance(message, dict): - self.id = int(message["id"]) - self.created_at = datetime.fromisoformat(message["created_at"]) - self.edited_at = ( - datetime.fromisoformat(message["edited_at"]) - if message["edited_at"] is not None - else None - ) - self.author = int(message["author"]) - self.pinned = message["pinned"] - self.mention_everyone = message["mention_everyone"] - self.tts = message["tts"] - self.reference = ( - int(message["reference"]) if message["reference"] is not None else None - ) - self.bot = message["bot"] - self.content = message["content"] - self.mentions = [int(m) for m in message["mentions"]] - self.role_mentions = [int(m) for m in message["role_mentions"]] - self.channel_mentions = [int(m) for m in message["channel_mentions"]] - self.image = message["image"] - self.reactions = message["reactions"] - - async def load(self, message: discord.Message): - for reaction in message.reactions: - self.reactions[str(reaction.emoji)] = [] - async for user in reaction.users(): - self.reactions[str(reaction.emoji)] += [user.id] - - def dict(self) -> dict: - message = dict(self.__dict__) - message["created_at"] = self.created_at.isoformat() - message["edited_at"] = ( - self.edited_at.isoformat() if self.edited_at is not None else None - ) - return message - - -class ChannelLogs: - def __init__(self, channel: Union[discord.TextChannel, dict]): - if isinstance(channel, discord.TextChannel): - self.id = channel.id - self.name = channel.name - self.last_message_id = None - self.format = FORMAT - self.messages = [] - elif isinstance(channel, dict): - self.format = channel["format"] if "format" in channel else None - if self.format != FORMAT: - return - self.id = int(channel["id"]) - self.name = channel["name"] - self.last_message_id = int(channel["last_message_id"]) - self.messages = [MessageLog(message) for message in channel["messages"]] - - async def load(self, channel: discord.TextChannel) -> Tuple[int, int]: - self.name = channel.name - self.channel = channel - try: - if self.last_message_id is not None: # append - while self.last_message_id != channel.last_message_id: - async for message in channel.history( - limit=CHUNK_SIZE, - after=FakeMessage(self.last_message_id), - oldest_first=True, - ): - self.last_message_id = message.id - m = MessageLog(message) - await m.load(message) - self.messages.insert(0, m) - yield len(self.messages), False - else: # first load - last_message_id = None - done = 0 - while done >= CHUNK_SIZE or last_message_id is None: - done = 0 - async for message in channel.history( - limit=CHUNK_SIZE, - before=FakeMessage(last_message_id) - if last_message_id is not None - else None, - oldest_first=False, - ): - done += 1 - last_message_id = message.id - m = MessageLog(message) - await m.load(message) - self.messages += [m] - yield len(self.messages), False - self.last_message_id = channel.last_message_id - except discord.errors.HTTPException: - return # When an exception occurs (like Forbidden) - yield len(self.messages), True - - def dict(self) -> dict: - channel = dict(self.__dict__) - channel.pop("channel", None) - channel["messages"] = [message.dict() for message in self.messages] - return channel - - -class GuildLogs: - def __init__(self, guild: discord.Guild): - self.guild = guild - self.log_file = os.path.join(LOG_DIR, f"{guild.id}.logz") - self.channels = {} - - def dict(self) -> dict: - return {id: self.channels[id].dict() for id in self.channels} - - async def load( - self, progress: discord.Message, target_channels: List[discord.TextChannel] = [] - ) -> Tuple[int, int]: - global current_analysis - if self.log_file in current_analysis: - return -1, -1 - current_analysis += [self.log_file] - # read logs - t0 = datetime.now() - if os.path.exists(self.log_file): - channels = {} - try: - gziped_data = None - await code_message(progress, "Reading saved history (1/4)...") - with open(self.log_file, mode="rb") as f: - gziped_data = f.read() - await code_message(progress, "Reading saved history (2/4)...") - json_data = gzip.decompress(gziped_data) - await code_message(progress, "Reading saved history (3/4)...") - channels = json.loads(json_data) - await code_message(progress, "Reading saved history (4/4)...") - self.channels = {int(id): ChannelLogs(channels[id]) for id in channels} - # remove invalid format - self.channels = { - id: self.channels[id] - for id in self.channels - if self.channels[id].format == FORMAT - } - dt = (datetime.now() - t0).total_seconds() - logging.info(f"log {self.guild.id} > loaded in {dt} s") - except json.decoder.JSONDecodeError: - logging.error(f"log {self.guild.id} > invalid JSON") - except IOError: - logging.error(f"log {self.guild.id} > cannot read") - # load channels - t0 = datetime.now() - if len(target_channels) == 0: - target_channels = self.guild.text_channels - loading_new = 0 - total_msg = 0 - queried_msg = 0 - total_chan = 0 - max_chan = len(target_channels) - await code_message( - progress, - f"Reading history...\n0 messages in 0/{max_chan} channels\n(this might take a while)", - ) - for channel in target_channels: - if channel.id not in self.channels: - loading_new += 1 - self.channels[channel.id] = ChannelLogs(channel) - start_msg = len(self.channels[channel.id].messages) - async for count, done in self.channels[channel.id].load(channel): - if count > 0: - tmp_queried_msg = queried_msg + count - start_msg - tmp_msg = total_msg + count - warning_msg = "(this might take a while)" - if len(target_channels) > 5 and loading_new > 5: - warning_msg = ( - "(most channels are new, this might take a looong while)" - ) - elif loading_new > 0: - warning_msg = ( - "(some channels are new, this might take a long while)" - ) - dt = (datetime.now() - t0).total_seconds() - await code_message( - progress, - f"Reading history...\n{tmp_msg:,} messages in {total_chan + 1}/{max_chan} channels ({round(tmp_queried_msg/dt)}m/s)\n{warning_msg}", - ) - if done: - total_chan += 1 - total_msg += len(self.channels[channel.id].messages) - queried_msg += count - start_msg - dt = (datetime.now() - t0).total_seconds() - logging.info( - f"log {self.guild.id} > queried in {dt} s -> {queried_msg / dt} m/s" - ) - # write logs - t0 = datetime.now() - await code_message( - progress, - f"Saving (1/3)...\n{total_msg:,} messages in {total_chan} channels", - ) - json_data = bytes(json.dumps(self.dict()), "utf-8") - await code_message( - progress, - f"Saving (2/3)...\n{total_msg:,} messages in {total_chan} channels", - ) - gziped_data = gzip.compress(json_data) - await code_message( - progress, - f"Saving (3/3)...\n{total_msg:,} messages in {total_chan} channels", - ) - with open(self.log_file, mode="wb") as f: - f.write(gziped_data) - dt = (datetime.now() - t0).total_seconds() - logging.info(f"log {self.guild.id} > written in {dt} s") - await code_message( - progress, f"Analysing...\n{total_msg:,} messages in {total_chan} channels" - ) - current_analysis.remove(self.log_file) - return total_msg, total_chan diff --git a/src/utils/utils.py b/src/utils/utils.py index b3ccd29..591b637 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -18,6 +18,11 @@ def mention(member_id: int) -> str: return f"<@{member_id}>" +class FakeMessage: + def __init__(self, id: int): + self.id = id + + # FILE