diff --git a/.gitignore b/.gitignore
index 32b7f43..03d4669 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 /node_modules
 /config.json
 /config.example.json
+/robots_list.json
 /data
 /data/*
 /test_data
diff --git a/README.md b/README.md
index b990175..ad6f0c2 100644
--- a/README.md
+++ b/README.md
@@ -336,6 +336,11 @@ Any URL like `/year/month/day/anything/` will redirect to this article (and link
 * `hit_counter`
   * `unique_visitor_timeout`: (default: 7200000 / 2h)
     specify the time (in ms) before a visitor can be accounted again
+* `robots`
+  * `list_url`: (default: https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json)
+    URL the web crawler patterns are fetched from
+  * `list_file`: (default: robots_list.json)
+    file the fetched web crawler patterns are stored in
 * `redis`
   Options to connect to redis (see [redis options](https://github.com/NodeRedis/node-redis#options-object-properties) for more info)
   * `host`: (default: localhost)
diff --git a/package-lock.json b/package-lock.json
index 9414459..a854cac 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,11 +1,11 @@
 {
   "name": "gitblog.md",
-  "version": "1.2.8",
+  "version": "1.3.2",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
-      "version": "1.2.8",
+      "version": "1.3.2",
       "hasInstallScript": true,
       "license": "ISC",
       "dependencies": {
diff --git a/package.json b/package.json
index 5d63228..9bb1ef0 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "gitblog.md",
-  "version": "1.3.1",
+  "version": "1.3.2",
   "description": "A static blog using Markdown pulled from your git repository.",
   "main": "src/server.js",
   "dependencies": {
diff --git a/src/app.js b/src/app.js
index 3364ce2..70eb174 100644
--- a/src/app.js
+++ b/src/app.js
@@ -61,6 +61,23 @@ module.exports = (config) => {
             }
         },
     );
+    const botDetector = require('./bot_detector')(config);
+    botDetector.load((status, err) => {
+        switch (status) {
+            case botDetector.status.FETCH_OK:
+                console.log(cons.ok, 'fetched robots list');
+                break;
+            case botDetector.status.FETCH_ERROR:
+                console.error(cons.error, 'error fetching robots list : ' + err);
+                break;
+            case botDetector.status.READ_OK:
+                console.log(cons.ok, `read robots list: ${botDetector.count}`);
+                break;
+            case botDetector.status.READ_ERROR:
+                console.error(cons.error, 'error reading robots list : ' + err);
+                break;
+        }
+    });
 
     // set view engine from configuration
     app.set('view engine', config['view_engine']);
@@ -145,6 +162,9 @@ module.exports = (config) => {
     });
     app.use(limiter);
 
+    //detect robots
+    app.use(botDetector.handle);
+
     //log request at result end
     app.use((req, res, next) => {
         if (config['access_log']) {
@@ -168,7 +188,7 @@ module.exports = (config) => {
             if (err) {
                 showError(req, res, 404);
             } else {
-                hc.count(req, '/', () => {
+                hc.count(req, '/', req.isRobot, () => {
                     render(req, res, homePath,
                         {
                             articles: Object.values(articles)
@@ -271,7 +291,7 @@ module.exports = (config) => {
                     showError(req, res, 404);
                 }
             } else {
-                hc.count(req, articlePath, () => {
+                hc.count(req, articlePath, req.isRobot, () => {
                     renderer.render(article.realPath, (err, html) => {
                         if (err) {
                             console.log(cons.error, `failed to render article ${req.path} : ${err}`);
diff --git a/src/bot_detector.js b/src/bot_detector.js
new file mode 100644
index 0000000..4e05e4a
--- /dev/null
+++ b/src/bot_detector.js
@@ -0,0 +1,146 @@
+const https = require('https');
+const fs = require('fs');
+const { pipeline } = require('stream');
+
+/**
+ * Build the robots detector: it downloads a list of crawler user-agent
+ * patterns, stores it on disk and flags the requests matching one of them.
+ *
+ * @param {object} config application configuration, `config.robots.list_url`
+ *   and `config.robots.list_file` are read again on every `load()` call
+ * @returns {object} detector exposing `status`, `count`, `load` and `handle`
+ */
+module.exports = (config) => {
+    const _this = {
+        status: {
+            FETCH_OK: 1,
+            FETCH_ERROR: 2,
+            READ_OK: 3,
+            READ_ERROR: 4,
+        },
+        // number of entries of the last list successfully loaded
+        count: 0,
+        // compiled patterns, null as long as no pattern is loaded
+        regex: null,
+        knownBots: [],
+        known: [],
+    };
+
+    /**
+     * Download the patterns list into the configured file.
+     *
+     * @param {function} cb called once, with an error description on failure
+     *   and without argument on success
+     */
+    const fetchList = (cb) => {
+        let called = false;
+        // the request and the pipeline may both report the same failure
+        const done = (err) => {
+            if (!called) {
+                called = true;
+                cb(err);
+            }
+        };
+        const onResponse = (res) => {
+            if (res.statusCode !== 200) {
+                // consume the unused body so the connection is released
+                res.resume();
+                done(res.statusCode);
+            } else {
+                const file = fs.createWriteStream(config['robots']['list_file']);
+                // pipeline reports read and write errors through its callback
+                pipeline(res, file, (err) => {
+                    done(err ? err.message : undefined);
+                });
+            }
+        };
+        try {
+            https.get(config['robots']['list_url'], onResponse).on('error', (err) => {
+                done(err.message);
+            });
+        } catch (err) {
+            // https.get throws synchronously when the url is malformed
+            done(err.message);
+        }
+    };
+
+    /**
+     * Read and parse the stored patterns list.
+     *
+     * @param {function} cb called once with `(err, data)`
+     */
+    const readFile = (cb) => {
+        fs.readFile(config['robots']['list_file'], { encoding: 'utf-8' }, (err, data) => {
+            if (err) {
+                cb(err, undefined);
+                return;
+            }
+            let list;
+            try {
+                list = JSON.parse(data);
+            } catch (parseErr) {
+                cb(parseErr, undefined);
+                return;
+            }
+            // called outside of the try block so that an exception thrown
+            // by cb cannot trigger a second call
+            cb(undefined, list);
+        });
+    };
+
+    /**
+     * Compile the patterns of the list into a single regular expression.
+     *
+     * @param {Array} list parsed content of the patterns file
+     * @returns {RegExp|null} null when no entry has a pattern, as an empty
+     *   alternation would match every user agent
+     * @throws {TypeError} when the list is not an array
+     * @throws {SyntaxError} when a pattern is not a valid regular expression
+     */
+    const buildRegex = (list) => {
+        if (!Array.isArray(list)) {
+            throw new TypeError('robots list is not an array');
+        }
+        const patterns = list.filter(v => v && v['pattern']).map(v => v['pattern']);
+        return patterns.length > 0 ? new RegExp('(' + patterns.join('|') + ')') : null;
+    };
+
+    /**
+     * Fetch the list then read it back from the disk. The file is read even
+     * when the fetch failed, so that a previously stored list is still used.
+     *
+     * @param {function} cb called twice with `(status, err)`: first with the
+     *   fetch status, then with the read status
+     */
+    _this.load = (cb) => {
+        fetchList((fetchErr) => {
+            cb(fetchErr ? _this.status.FETCH_ERROR : _this.status.FETCH_OK, fetchErr);
+            readFile((readErr, list) => {
+                let err = readErr;
+                if (!err) {
+                    try {
+                        const regex = buildRegex(list);
+                        _this.count = list.length;
+                        _this.regex = regex;
+                    } catch (buildErr) {
+                        // an unusable list leaves the previous patterns in place
+                        err = buildErr;
+                    }
+                }
+                cb(err ? _this.status.READ_ERROR : _this.status.READ_OK, err);
+            });
+        });
+    };
+
+    /**
+     * Middleware setting `req.isRobot` from the User-Agent header of the
+     * request. No request is flagged as long as no pattern is loaded.
+     */
+    _this.handle = (req, res, next) => {
+        const userAgent = req.headers['user-agent'] || '';
+        req.isRobot = _this.regex !== null && _this.regex.test(userAgent);
+        next();
+    };
+
+    return _this;
+};
diff --git a/src/config.default.json b/src/config.default.json
index 49565f9..cc69c16 100644
--- a/src/config.default.json
+++ b/src/config.default.json
@@ -63,6 +63,10 @@
   "hit_counter": {
     "unique_visitor_timeout": 7200000
   },
+  "robots": {
+    "list_url": "https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json",
+    "list_file": "robots_list.json"
+  },
   "redis": {
     "host": "localhost",
     "port": 6379
diff --git a/src/hit_counter.js b/src/hit_counter.js
index f2dc707..1b3cb4c 100644
--- a/src/hit_counter.js
+++ b/src/hit_counter.js
@@ -8,8 +8,8 @@ module.exports = (config, onConnect, onError) => {
 
     const visitors = {};
 
-    const count = (req, path, cb) => {
-        if (!client.connected) {
+    const count = (req, path, disable, cb) => {
+        if (!client.connected || disable) {
             cb();
         } else {
             const ip = req.headers['x-forwarded-for'] || req.connection.remoteAddress;
diff --git a/test/bot_detector.test.js b/test/bot_detector.test.js
new file mode 100644
index 0000000..60855a5
--- /dev/null
+++ b/test/bot_detector.test.js
@@ -0,0 +1,145 @@
+const fs = require('fs');
+const utils = require('./test_utils');
+
+const dataDir = 'test_data';
+
+const config = {
+    robots: {
+        list_url: '',
+        list_file: `${dataDir}/robots_list.json`,
+    },
+};
+
+
+beforeAll(() => {
+    utils.deleteFolderSync(dataDir);
+    fs.mkdirSync(dataDir);
+});
+
+afterAll(() => {
+    if (fs.existsSync(dataDir)) {
+        utils.deleteFolderSync(dataDir);
+    }
+});
+
+const botDetector = require('../src/bot_detector')(config);
+
+describe('load()', () => {
+    test('success', (done) => {
+        config.robots = {
+            list_url: 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json',
+            list_file: `${dataDir}/robots_list_success.json`,
+        };
+        let count = 0;
+        botDetector.load((status, err) => {
+            expect(err).not.toBeDefined();
+            expect(status).toBe(count === 0 ? botDetector.status.FETCH_OK : botDetector.status.READ_OK);
+            if (count > 0) {
+                done();
+            }
+            count++;
+        });
+    });
+
+    test('fetch and file failure', (done) => {
+        let count = 0;
+        config.robots = {
+            list_url: 'https://blog.klemek.fr/invalid.json',
+            list_file: `${dataDir}/robots_list_fail_1.json`,
+        };
+        botDetector.load((status) => {
+            expect(status).toBe(count === 0 ? botDetector.status.FETCH_ERROR : botDetector.status.READ_ERROR);
+            if (count > 0) {
+                done();
+            }
+            count++;
+        });
+    });
+
+    test('fetch failure and file ok', (done) => {
+        let count = 0;
+        config.robots = {
+            list_url: 'https://blog.klemek.fr/invalid.json',
+            list_file: `${dataDir}/robots_list_fail_2.json`,
+        };
+        fs.writeFile(config.robots.list_file, '[]\n', { encoding: 'utf-8' }, () => {
+            botDetector.load((status) => {
+                expect(status).toBe(count === 0 ? botDetector.status.FETCH_ERROR : botDetector.status.READ_OK);
+                if (count > 0) {
+                    done();
+                }
+                count++;
+            });
+        });
+    });
+});
+
+
+describe('handle()', () => {
+    beforeAll((done) => {
+        config.robots = {
+            list_url: 'https://blog.klemek.fr/invalid.json',
+            list_file: `${dataDir}/robots_list_fake.json`,
+        };
+        fs.writeFile(config.robots.list_file, '[{"pattern":"bot"}]\n', { encoding: 'utf-8' }, () => {
+            botDetector.load((status) => {
+                if (status !== botDetector.status.FETCH_ERROR) {
+                    done();
+                }
+            });
+        });
+    });
+
+    test('not bot', (done) => {
+        const req = {
+            headers: {
+                'user-agent': 'my user agent',
+            },
+        };
+        botDetector.handle(req, null, () => {
+            expect(req.isRobot).toBeFalsy();
+            done();
+        });
+    });
+
+    test('bot', (done) => {
+        const req = {
+            headers: {
+                'user-agent': 'bot',
+            },
+        };
+        botDetector.handle(req, null, () => {
+            expect(req.isRobot).toBeTruthy();
+            done();
+        });
+    });
+});
+
+
+describe('handle() with an empty list', () => {
+    beforeAll((done) => {
+        config.robots = {
+            list_url: 'https://blog.klemek.fr/invalid.json',
+            list_file: `${dataDir}/robots_list_empty.json`,
+        };
+        fs.writeFile(config.robots.list_file, '[]\n', { encoding: 'utf-8' }, () => {
+            botDetector.load((status) => {
+                if (status !== botDetector.status.FETCH_ERROR) {
+                    done();
+                }
+            });
+        });
+    });
+
+    test('nobody is flagged', (done) => {
+        const req = {
+            headers: {
+                'user-agent': 'my user agent',
+            },
+        };
+        botDetector.handle(req, null, () => {
+            expect(req.isRobot).toBe(false);
+            done();
+        });
+    });
+});
diff --git a/test/hit_counter.test.js b/test/hit_counter.test.js
index c7e3788..a27c0b9 100644
--- a/test/hit_counter.test.js
+++ b/test/hit_counter.test.js
@@ -98,7 +98,7 @@ describe('read()', () => {
         hc.count({
             headers: {},
             connection: { remoteAddress: 'test1' },
-        }, '/test/path/5', () => {
+        }, '/test/path/5', false, () => {
             hc.read('/test/path/5', (data) => {
                 expect(data).toBeDefined();
                 expect(data.current_visitors).toBe(1);
@@ -111,7 +111,7 @@ describe('read()', () => {
         hc.count({
             headers: {},
             connection: { remoteAddress: 'test1' },
-        }, '/test/path/5', () => {
+        }, '/test/path/5', false, () => {
             hc.read('/test/path/5', (data) => {
                 expect(data).toBeDefined();
                 expect(data.current_visitors).toBe(0);
@@ -145,7 +145,7 @@ describe('count()', () => {
         hc.count({
             headers: {},
             connection: { remoteAddress: 'test1' },
-        }, '/test/path/1', () => {
+        }, '/test/path/1', false, () => {
             expect(multiCalled).toBeTruthy();
             expect(hincrbyCalls).toEqual([
                 [
@@ -177,11 +177,11 @@ describe('count()', () => {
         hc.count({
             headers: {},
             connection: { remoteAddress: 'test2' },
-        }, '/test/path/2', () => {
+        }, '/test/path/2', false, () => {
             hc.count({
                 headers: {},
                 connection: { remoteAddress: 'test2' },
-            }, '/test/path/2', () => {
+            }, '/test/path/2', false, () => {
                 expect(hincrbyCalls).toEqual([
                     [
                         '/test/path/2',
@@ -223,11 +223,11 @@ describe('count()', () => {
         hc.count({
             headers: {},
             connection: { remoteAddress: 'test3' },
-        }, '/test/path/3', () => {
+        }, '/test/path/3', false, () => {
             hc.count({
                 headers: {},
                 connection: { remoteAddress: 'test3' },
-            }, '/test/path/3', () => {
+            }, '/test/path/3', false, () => {
                 expect(hincrbyCalls).toEqual([
                     [
                         '/test/path/3',