-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.js
More file actions
296 lines (262 loc) · 10.9 KB
/
index.js
File metadata and controls
296 lines (262 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
const AdvancedWebScraper = require('./AdvancedWebScraper');
const yargs = require('yargs');
const chalk = require('chalk');
const fs = require('fs-extra');
// CLI option definitions (yargs). Parsed once at module load; the resulting
// `argv` object is read by main() below.
const argv = yargs
// Target selection: either an inline list of URLs...
.option('urls', {
alias: 'u',
type: 'array',
description: 'URLs to scrape from any website',
demandOption: false
})
// ...or a text file with one URL per line ('#' lines are comments — see
// the file parsing in main()).
.option('file', {
alias: 'f',
type: 'string',
description: 'File containing URLs (one per line)',
demandOption: false
})
// Where scraped output is written.
.option('output', {
alias: 'o',
type: 'string',
description: 'Output directory',
default: './scraped_data'
})
// Which serialization(s) to emit; 'all' produces every format.
.option('format', {
type: 'string',
description: 'Output format (json, csv, text, all)',
default: 'all',
choices: ['json', 'csv', 'text', 'all']
})
// Base per-request delay; main() may raise this in human mode.
.option('delay', {
alias: 'd',
type: 'number',
description: 'Base delay between requests in milliseconds',
default: 3000
})
// Evasion modes, in increasing order of stealth: --browser, --stealth,
// --human. Any of the three forces browser simulation (see main()).
.option('browser', {
alias: 'b',
type: 'boolean',
description: 'Enable browser simulation mode for advanced anti-bot evasion',
default: false
})
.option('stealth', {
alias: 's',
type: 'boolean',
description: 'Enable maximum stealth mode (forces browser simulation)',
default: false
})
// Browser window size override, parsed as "<width>x<height>" in main().
.option('viewport', {
type: 'string',
description: 'Custom viewport size (e.g., 1920x1080)',
default: null
})
// NOTE(review): alias 'h' may shadow a conventional -h/--help shortcut;
// yargs' .help() below registers --help, so verify -h behaves as intended.
.option('human', {
alias: 'h',
type: 'boolean',
description: 'Enable human behavior simulation (browser + enhanced delays + stealth)',
default: false
})
// Batch-processing knobs consumed by scrapeMultiplePages() via main().
.option('batch-delay', {
type: 'number',
description: 'Additional delay between each URL in batch processing (ms)',
default: 0
})
.option('continue-on-error', {
type: 'boolean',
description: 'Continue processing remaining URLs if one fails',
default: true
})
.option('max-concurrent', {
type: 'number',
description: 'Maximum number of concurrent requests (1 for sequential)',
default: 1
})
.help()
.argv;
/**
 * CLI entry point: configures an AdvancedWebScraper from `argv`, resolves the
 * list of target URLs (from --urls or --file), runs the batch scrape, prints
 * a summary, and always tears the scraper down afterwards.
 *
 * Exit behavior:
 *  - exits 1 if the URL file is unreadable or no valid URLs remain;
 *  - exits 0 (after printing usage) when no URLs were supplied at all;
 *  - sets process.exitCode = 1 on a scrape error so the `finally` cleanup
 *    still runs before the process exits.
 */
async function main() {
  console.log(chalk.cyan('🏛️ Advanced Web Scraper v2.0'));
  console.log(chalk.cyan('==================================\n'));

  // --viewport expects "<width>x<height>" (e.g. "1920x1080"); a malformed
  // value is warned about and ignored rather than aborting the run.
  let viewport = null;
  if (argv.viewport) {
    const [width, height] = argv.viewport.split('x').map(Number);
    if (width && height) {
      viewport = { width, height };
    } else {
      console.log(chalk.yellow('⚠️ Invalid viewport format. Use format: 1920x1080'));
    }
  }

  // Human mode implies browser simulation plus slower pacing: at least a
  // 5000 ms base delay and a 3000 ms batch delay unless one was supplied.
  let enhancedDelay = argv.delay;
  const useBrowserMode = argv.browser || argv.stealth || argv.human;
  if (argv.human) {
    enhancedDelay = Math.max(argv.delay, 5000);
    if (!argv['batch-delay']) {
      // NOTE(review): falsy check also overrides an explicit --batch-delay 0;
      // confirm that is acceptable before tightening to a nullish check.
      argv['batch-delay'] = 3000;
    }
  }

  const scraper = new AdvancedWebScraper({
    outputDir: argv.output,
    delay: enhancedDelay,
    useBrowser: useBrowserMode,
    humanMode: argv.human
  });
  if (viewport) {
    scraper.customViewport = viewport;
  }

  // Announce the most specific active mode (human > stealth > browser).
  if (argv.human) {
    console.log(chalk.blue('👤 Human behavior simulation enabled'));
    console.log(chalk.gray(' 🧠 Enhanced delays for natural behavior'));
    console.log(chalk.gray(' 🖱️ Mouse movements and scrolling simulation'));
    console.log(chalk.gray(' 👁️ Reading pattern simulation'));
    console.log(chalk.gray(' 🔄 Advanced User-Agent rotation'));
  } else if (argv.stealth) {
    console.log(chalk.blue('🥷 Maximum stealth mode enabled'));
  } else if (argv.browser) {
    console.log(chalk.blue('🤖 Browser simulation mode enabled'));
  }

  // Display configuration after setup
  const stats = scraper.getStats();
  console.log(chalk.gray(`📊 Configuration:`));
  console.log(chalk.gray(` User-Agents: ${stats.userAgentsAvailable} available`));
  let modeDescription = 'HTTP Only';
  if (argv.human) modeDescription = 'Human Simulation';
  else if (argv.stealth) modeDescription = 'Maximum Stealth';
  else if (argv.browser) modeDescription = 'Browser Automation';
  console.log(chalk.gray(` Mode: ${modeDescription}`));
  console.log(chalk.gray(` Base Delay: ${enhancedDelay}ms`));
  console.log(chalk.gray(` Output Format: ${argv.format}`));
  console.log('');

  // Resolve the URL list: --urls wins over --file; with neither, print usage.
  let urls = [];
  if (argv.urls) {
    urls = argv.urls;
  } else if (argv.file) {
    try {
      const fileContent = await fs.readFile(argv.file, 'utf8');
      // One URL per line; '#' lines are comments, non-http lines are dropped.
      urls = fileContent.split('\n')
        .map(line => line.trim())
        .filter(line => line && !line.startsWith('#')) // Allow comments
        .filter(line => line.startsWith('http'));
      console.log(chalk.blue(`📄 Loaded ${urls.length} URLs from ${argv.file}`));
      // Show preview of URLs if there are many
      if (urls.length > 5) {
        console.log(chalk.gray(' Preview:'));
        urls.slice(0, 3).forEach((url, i) => {
          console.log(chalk.gray(` ${i + 1}. ${url.substring(0, 60)}${url.length > 60 ? '...' : ''}`));
        });
        console.log(chalk.gray(` ... and ${urls.length - 3} more URLs`));
      } else {
        urls.forEach((url, i) => {
          console.log(chalk.gray(` ${i + 1}. ${url}`));
        });
      }
      console.log('');
    } catch (error) {
      console.error(chalk.red(`❌ Error reading file ${argv.file}: ${error.message}`));
      console.log(chalk.yellow('\n💡 Tips for creating a URL file:'));
      console.log(chalk.gray(' • One URL per line'));
      console.log(chalk.gray(' • URLs must start with http:// or https://'));
      console.log(chalk.gray(' • Lines starting with # are treated as comments'));
      console.log(chalk.gray(' • Empty lines are ignored'));
      // NOTE(review): exiting here skips scraper.cleanup(); confirm the
      // constructor acquires no resources that need teardown at this point.
      process.exit(1);
    }
  } else {
    console.log(chalk.yellow('📋 No URLs provided. Please specify URLs to scrape.'));
    console.log(chalk.blue('\nUsage examples:'));
    console.log(chalk.gray(' node index.js --urls https://example.com/page1 https://example.com/page2'));
    console.log(chalk.gray(' node index.js --file urls.txt'));
    console.log(chalk.gray(' node index.js --file batch_urls.txt --stealth --batch-delay 5000'));
    console.log(chalk.gray(' node index.js --urls https://some-wiki.com/page --delay 5000'));
    console.log(chalk.blue('\n📝 URL file format:'));
    console.log(chalk.gray(' # This is a comment'));
    console.log(chalk.gray(' https://site1.com/page1'));
    console.log(chalk.gray(' https://site2.com/page2'));
    console.log(chalk.gray(' # Another comment'));
    console.log(chalk.gray(' https://site3.com/page3'));
    process.exit(0);
  }

  if (urls.length === 0) {
    console.error(chalk.red('❌ No valid URLs found to scrape.'));
    process.exit(1);
  }

  console.log(chalk.green(`🎯 Found ${urls.length} URLs to scrape:`));
  urls.forEach((url, index) => {
    console.log(chalk.gray(` ${index + 1}. ${url}`));
  });
  console.log();

  try {
    // Prepare batch processing options
    const batchOptions = {
      batchDelay: argv['batch-delay'] || 0,
      continueOnError: argv['continue-on-error'],
      maxConcurrent: argv['max-concurrent'] || 1,
      showProgress: true
    };

    console.log(chalk.blue(`📊 Batch Processing Configuration:`));
    console.log(chalk.gray(` Continue on Error: ${batchOptions.continueOnError ? 'Yes' : 'No'}`));
    console.log(chalk.gray(` Max Concurrent: ${batchOptions.maxConcurrent}`));
    if (batchOptions.batchDelay > 0) {
      console.log(chalk.gray(` Batch Delay: ${batchOptions.batchDelay}ms`));
    }
    console.log('');

    const results = await scraper.scrapeMultiplePages(urls, argv.format, batchOptions);

    if (results.length > 0) {
      console.log(chalk.green('\n✅ Scraping completed successfully!'));
      console.log(chalk.cyan('\n📂 Generated files:'));
      console.log(chalk.gray(' ├── json/ (structured data for each page)'));
      console.log(chalk.gray(' ├── csv/ (tabular data)'));
      console.log(chalk.gray(' ├── text/ (plain text content)'));
      console.log(chalk.gray(' ├── consolidated_*.json (all data combined)'));
      console.log(chalk.gray(' ├── consolidated_*.csv (all data in CSV)'));
      console.log(chalk.gray(' ├── consolidated_*.txt (all text combined)'));
      console.log(chalk.gray(' └── training_dataset_*.jsonl (AI training format)'));
    } else {
      console.log(chalk.red('\n❌ No pages were successfully scraped.'));
      console.log(chalk.yellow('This may be due to:'));
      console.log(chalk.gray(' • Anti-bot protection blocking requests'));
      console.log(chalk.gray(' • Invalid or non-existent URLs'));
      console.log(chalk.gray(' • Network connectivity issues'));
      console.log(chalk.gray(' • Server-side restrictions'));
    }
  } catch (error) {
    console.error(chalk.red(`\n❌ Error during scraping: ${error.message}`));
    // Use exitCode instead of process.exit(1): process.exit() terminates
    // immediately and would skip the `finally` cleanup below (the previous
    // version duplicated cleanup in this catch for that reason).
    process.exitCode = 1;
  } finally {
    // Always perform cleanup, on success and on failure alike.
    try {
      await scraper.cleanup();
    } catch (cleanupError) {
      console.error(chalk.gray(`Warning: Cleanup error: ${cleanupError.message}`));
    }
  }
}
// Last-resort error handlers: log and fail fast, since process state is
// unknown after an unhandled rejection or uncaught exception. The handlers
// are deliberately not `async` — they await nothing, and an async handler
// that threw would itself produce a fresh unhandled rejection.
process.on('unhandledRejection', (reason, promise) => {
console.error(chalk.red('Unhandled Rejection at:', promise, 'reason:', reason));
process.exit(1);
});
process.on('uncaughtException', (error) => {
console.error(chalk.red('Uncaught Exception:', error));
process.exit(1);
});
// Graceful-exit handlers for termination signals (Ctrl-C / kill).
// NOTE(review): the messages say "cleaning up" but no cleanup runs here —
// the scraper instance is scoped inside main(); its teardown happens only in
// main()'s `finally`, which process.exit() bypasses. Consider exposing the
// scraper so these handlers can await scraper.cleanup() before exiting.
// Not `async`: nothing is awaited.
process.on('SIGINT', () => {
console.log(chalk.yellow('\n⚠️ Received SIGINT, cleaning up...'));
process.exit(0);
});
process.on('SIGTERM', () => {
console.log(chalk.yellow('\n⚠️ Received SIGTERM, cleaning up...'));
process.exit(0);
});
// Run only when executed directly (`node index.js`), not when require()d.
if (require.main === module) {
// Don't leave main()'s promise floating: attach an explicit rejection
// handler instead of relying solely on the process-level
// 'unhandledRejection' hook above.
main().catch((error) => {
console.error(chalk.red(`\n❌ Fatal error: ${error.message}`));
process.exit(1);
});
}
// Re-export the scraper class so this module is usable as a library too.
module.exports = { AdvancedWebScraper };