Today, I'm doing a simple example: "Crawl data from the web using Node.js". I want to call a URL, then download all the images on that page and save them to a directory.
I'm using the libraries below:
npm install request-promise cheerio request fs
- request-promise : calls the URL and returns the response body as a promise
- cheerio : parses the HTML DOM, so we can use jQuery-style selectors (id, class) to get values (see the short sketch after this list)
- request : streams each image download to disk
- fs : reads and writes files in the directory
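Here is a minimal sketch of how request-promise and cheerio work together; the URL and the selector are just placeholders, not part of the real crawler:
const rp = require("request-promise");
const cheerio = require("cheerio");
// Fetch a page and hand the HTML body to cheerio via the transform option.
rp({
  uri: "https://example.com",
  transform: body => cheerio.load(body),
}).then($ => {
  // $ is now a loaded DOM we can query with jQuery-style selectors.
  console.log($("title").text());
});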
Okay, once the project is set up, we need to create the following files and folders:
- create a dslink.txt file in the project directory, containing the list of URLs to crawl, one per line (see the example after this list)
- create an images folder in the project directory, used to save the downloaded images
- create a data.json file, used to save the crawled info
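For example, dslink.txt might look like this (the URLs are placeholders for whatever site you are crawling):
https://example.com/chapter-1
https://example.com/chapter-2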
Okay, now create the index.js file:
const rp = require("request-promise");
const cheerio = require("cheerio");
const request = require("request");
const fs = require("fs");
// Pause between requests so we don't hammer the server.
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
const dslink = "dslink.txt";
// One URL per line; strip stray whitespace and drop empty lines left by a trailing newline.
const arrayLink = fs.readFileSync(dslink).toString().split("\n")
  .map(link => link.trim())
  .filter(link => link !== "");
async function crawler() {
  await sleep(1000);
  const data = []; // accumulates results across all crawled links
  for (const linkchay of arrayLink) {
    let $;
    try {
      const options = {
        uri: linkchay,
        // transform hands the HTML body to cheerio, so rp resolves to a loaded DOM
        transform: function (body) {
          return cheerio.load(body);
        },
      };
      $ = await rp(options);
    } catch (error) {
      console.log("Broken link, skipping: " + linkchay);
      continue; // skip this URL instead of aborting the whole crawl
    }
    // Selectors below are specific to the target site.
    const title = $(".ten_title").text().trim();
    const tableContent = $(".info_content");
    // The chapter title.
    const chaperTitle = tableContent.find("p").text().trim();
    const chaperData = [];
    const chaperLink = tableContent.find("p").find("img");
    for (let j = 0; j < chaperLink.length; j++) {
      const post = $(chaperLink[j]);
      const postLink = post.attr("src");
      // Everything after the last "/" is the image file name.
      const filename = postLink.substring(postLink.lastIndexOf("/") + 1);
      download(postLink, filename, function () {
        //console.log("Link:" + linkchay);
      });
      const postTitle = post.text().trim();
      chaperData.push({
        postTitle,
        linkchay,
        filename,
      });
    }
    data.push({
      chaperTitle,
      chaperData,
    });
    // Rewrite data.json with everything collected so far,
    // instead of overwriting it with only the current page's data.
    fs.writeFileSync("data.json", JSON.stringify(data));
    console.log(linkchay + "------------->done");
    await sleep(1000); // be polite between requests
  }
}
// Start the crawl.
crawler();
// Download helper: declared as a function statement so it is hoisted
// above the crawler() call that runs first.
function download(uri, filename, callback) {
  request.head(uri, function (err, res) {
    if (err) {
      console.log("Could not reach image: " + uri);
      return callback(err);
    }
    console.log('content-type:', res.headers['content-type']);
    console.log('content-length:', res.headers['content-length']);
    request(uri).pipe(fs.createWriteStream('./images/' + filename)).on('close', callback);
  });
}
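Run it with node index.js. Each crawled page pushes one entry into data.json, so given the fields collected above, the output has roughly this shape (the values here are purely illustrative):
[
  {
    "chaperTitle": "Chapter 1",
    "chaperData": [
      {
        "postTitle": "",
        "linkchay": "https://example.com/chapter-1",
        "filename": "page-01.jpg"
      }
    ]
  }
]
The downloaded images themselves land in the images/ folder, named by the last segment of each src URL.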