Today, I'm doing a simple example: "Crawl data from the web using Node.js". I want to call a URL, then download all the images on that page and save them to a directory.
I'm using the libraries below:
npm install request-promise cheerio request fs
- request-promise : calls the URL and returns the response body as a promise
- cheerio : parses the HTML DOM, so we can use jQuery-style selectors (id, class) to get values (see the short sketch after this list)
- request : streams each image download to disk
- fs : reads and writes files in the directory
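Here is a minimal sketch of how request-promise and cheerio work together; the URL and the selector are just placeholders, not part of the real crawler:
const rp = require("request-promise");
const cheerio = require("cheerio");
// Fetch a page and hand the HTML body to cheerio via the transform option.
rp({
  uri: "https://example.com",
  transform: body => cheerio.load(body),
}).then($ => {
  // $ is now a loaded DOM we can query with jQuery-style selectors.
  console.log($("title").text());
});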
Okay, once the project is set up, we need to create the following files and folders:
- create a dslink.txt file in the project directory, containing the list of URLs to crawl, one per line (see the example after this list)
- create an images folder in the project directory, used to save the downloaded images
- create a data.json file, used to save the crawled info
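For example, dslink.txt might look like this (the URLs are placeholders for whatever site you are crawling):
https://example.com/chapter-1
https://example.com/chapter-2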
Okay, now create the index.js file:
const rp = require("request-promise");
const cheerio = require("cheerio");
const request = require("request");
const fs = require("fs");
// Pause between requests so we don't hammer the server.
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
const dslink = "dslink.txt";
// One URL per line; strip stray whitespace and drop empty lines left by a trailing newline.
const arrayLink = fs.readFileSync(dslink).toString().split("\n")
  .map(link => link.trim())
  .filter(link => link !== "");
async function crawler() {
  await sleep(1000);
  const data = []; // accumulates results across all crawled links
  for (const linkchay of arrayLink) {
    let $;
    try {
      const options = {
        uri: linkchay,
        // transform hands the HTML body to cheerio, so rp resolves to a loaded DOM
        transform: function (body) {
          return cheerio.load(body);
        },
      };
      $ = await rp(options);
    } catch (error) {
      console.log("Broken link, skipping: " + linkchay);
      continue; // skip this URL instead of aborting the whole crawl
    }
    // Selectors below are specific to the target site.
    const title = $(".ten_title").text().trim();
    const tableContent = $(".info_content");
    // The chapter title.
    const chaperTitle = tableContent.find("p").text().trim();
    const chaperData = [];
    const chaperLink = tableContent.find("p").find("img");
    for (let j = 0; j < chaperLink.length; j++) {
      const post = $(chaperLink[j]);
      const postLink = post.attr("src");
      // Everything after the last "/" is the image file name.
      const filename = postLink.substring(postLink.lastIndexOf("/") + 1);
      download(postLink, filename, function () {
        //console.log("Link:" + linkchay);
      });
      const postTitle = post.text().trim();
      chaperData.push({
        postTitle,
        linkchay,
        filename,
      });
    }
    data.push({
      chaperTitle,
      chaperData,
    });
    // Rewrite data.json with everything collected so far,
    // instead of overwriting it with only the current page's data.
    fs.writeFileSync("data.json", JSON.stringify(data));
    console.log(linkchay + "------------->done");
    await sleep(1000); // be polite between requests
  }
}
// Start the crawl.
crawler();
// Download helper: declared as a function statement so it is hoisted
// above the crawler() call that runs first.
function download(uri, filename, callback) {
  request.head(uri, function (err, res) {
    if (err) {
      console.log("Could not reach image: " + uri);
      return callback(err);
    }
    console.log('content-type:', res.headers['content-type']);
    console.log('content-length:', res.headers['content-length']);
    request(uri).pipe(fs.createWriteStream('./images/' + filename)).on('close', callback);
  });
}
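Run it with node index.js. Each crawled page pushes one entry into data.json, so given the fields collected above, the output has roughly this shape (the values here are purely illustrative):
[
  {
    "chaperTitle": "Chapter 1",
    "chaperData": [
      {
        "postTitle": "",
        "linkchay": "https://example.com/chapter-1",
        "filename": "page-01.jpg"
      }
    ]
  }
]
The downloaded images themselves land in the images/ folder, named by the last segment of each src URL.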