%words improvement

2021-04-09 00:40:28 +02:00
parent 8cc0e1fe65
commit 04f681dba6
3 changed files with 16 additions and 19 deletions
@@ -43,7 +43,7 @@
 * %react - rank users by their reactions
  * arguments:
    * <n> - top <n> messages, default is 10
-* %words - rank words by their usage
+* %words - (BETA) rank words by their usage
  * arguments:
    * <n> - words containing <n> or more letters, default is 3
    * <n2> - top <n2> words, default is 10
@@ -104,6 +104,8 @@ python3 src/main.py

 ## Changelog

+* **v1.13**
+  * improved `%words` scan
 * **v1.12**
  * more scans: `%words`
  * concurrent `fast` analysis
@@ -45,6 +45,12 @@ bot.register_command(
    "cancel: stop current analysis (not launched with fast)",
    "```\n" + "%cancel: Stop current analysis (not launched with fast)\n" + "```",
 )
+bot.register_command(
+    "words",
+    lambda *args: WordsScanner().compute(*args),
+    "words: (BETA) rank words by their usage",
+    WordsScanner.help(),
+)
 bot.register_command(
    "last",
    lambda *args: LastScanner().compute(*args),
@@ -63,12 +69,6 @@ bot.register_command(
    "first: read first message",
    FirstScanner.help(),
 )
-bot.register_command(
-    "words",
-    lambda *args: WordsScanner().compute(*args),
-    "words: rank words by their usage",
-    WordsScanner.help(),
-)
 bot.register_command(
    "mentioned",
    lambda *args: MentionedScanner().compute(*args),
@@ -20,7 +20,7 @@ class WordsScanner(Scanner):
    def help() -> str:
        return (
            "```\n"
-            + "%words: Rank words by their usage\n"
+            + "%words: (BETA) Rank words by their usage\n"
            + "arguments:\n"
            + COMMON_HELP_ARGS
            + "* <n> - words containings <n> or more letters, default is 3\n"
@@ -104,16 +104,13 @@ class WordsScanner(Scanner):
            or message.author in raw_members
        ):
            impacted = True
-            content = " ".join(
-                [
-                    block
-                    for block in message.content.split()
-                    if not re.match(r"^\w+:\/\/", block)
-                ]
-            )
+            content = message.content
+            content = re.sub(r"```.+```", "", content, flags=re.DOTALL)
+            content = re.sub(r"`.+`", "", content, flags=re.DOTALL)
+            content = re.sub(r"\w+:\/\/[^ ]+", "", content)
            for word in re.split("[^\w\-':]", content):
                m = re.match(
-                    r"(?!^:\w+:$)^[^\w]*((?![\d_])\w.*(?![\d_])\w)[^\w]*$", word
+                    r"(?!^:\w+:$)^[^\w]*((?![\d_])\w[\w\-']*(?![\d_])\w)[^\w]*$", word
                )
                if m:
                    word = m[1].lower()
@@ -126,7 +123,5 @@ class WordsScanner(Scanner):
                                words[word] = words[word + case]
                                del words[word + case]
                                break
-                        words[word].update_use(
-                            message.content.count(word), message.created_at
-                        )
+                        words[word].update_use(1, message.created_at)
        return impacted