From 140e472e292f861220e53e51439fdfc59cc33984 Mon Sep 17 00:00:00 2001 From: Klemek Date: Sun, 4 Apr 2021 21:56:39 +0200 Subject: [PATCH] bot detector base code --- .gitignore | 1 + src/bot_detector.js | 62 +++++++++++++++++++++++++++++++++++++++++ src/config.default.json | 4 +++ 3 files changed, 67 insertions(+) create mode 100644 src/bot_detector.js diff --git a/.gitignore b/.gitignore index 32b7f43..03d4669 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /node_modules /config.json /config.example.json +/robots_list.json /data /data/* /test_data diff --git a/src/bot_detector.js b/src/bot_detector.js new file mode 100644 index 0000000..4291a7e --- /dev/null +++ b/src/bot_detector.js @@ -0,0 +1,62 @@ +const https = require('https'); +const fs = require('fs'); + +module.exports = (config) => { + const _this = { + status: { + FETCH_OK: 1, + FETCH_ERROR: 2, + READ_OK: 3, + READ_ERROR: 4, + }, + count: [], + regex: null, + knownBots: [], + known: [], + }; + + const fetchList = (cb) => { + https.get(config['robots']['list_url'], (res) => { + const file = fs.createWriteStream(config['robots']['list_file']); + res.pipe(file); + file.on('finish', () => { + file.close(cb); + }); + }).on('error', (err) => { + cb(err.message); + }); + }; + + const readFile = (cb) => { + fs.readFile(config['robots']['list_file'], (err, data) => { + if (err) { + cb(err, undefined); + } else { + try { + cb(undefined, JSON.parse(data)); + } catch (err) { + cb(err, undefined); + } + } + }); + }; + + _this.load = (cb) => { + fetchList((err) => { + cb(err ? _this.status.FETCH_ERROR : _this.status.FETCH_OK, err); + readFile((err, data) => { + _this.count = data.length; + _this.regex = new RegExp('(' + data.filter(v => v['pattern']).map(v => v['pattern']) + .join('|') + ')'); + cb(err ? _this.status.READ_ERROR : _this.status.READ_OK, err); + }); + }); + }; + + _this.handle = (req, res, next) => { + req.isRobot = !!((req.headers['user-agent'] || '').match(_this.regex)); + next(); + }; + + return _this; +}; diff --git a/src/config.default.json b/src/config.default.json index 49565f9..cc69c16 100644 --- a/src/config.default.json +++ b/src/config.default.json @@ -63,6 +63,10 @@ "hit_counter": { "unique_visitor_timeout": 7200000 }, + "robots": { + "list_url": "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json", + "list_file": "robots_list.json" + }, "redis": { "host": "localhost", "port": 6379