DEV Community

Eko Priyanto
Eko Priyanto

Posted on

scrape data masjid SIMAS Kemenag

Langkah 1 — ambil semua URL masjid dari halaman daftar:


<?php

/**
 * =========================================================
 * SIMAS KEMENAG SCRAPER
 * SQLITE + MULTICURL + AUTO RESUME
 * =========================================================
 *
 * FITUR:
 * - multicurl batch aman
 * - sqlite database
 * - auto resume dari sqlite
 * - anti duplicate
 * - anti banned ringan
 * - sanitize utf8
 * - low memory
 * - stable long scraping
 * - restart otomatis tinggal jalankan ulang
 *
 * OUTPUT:
 * simas.db
 *
 * =========================================================
 */

// Long-running CLI job: lift the execution-time and memory ceilings.
set_time_limit(0);
ini_set('memory_limit', '-1');

// Timezone used by date() in the progress output.
date_default_timezone_set('Asia/Jakarta');

// =========================================================
// CONFIG
// =========================================================

// First and last listing page to request (inclusive). $startPage may be
// replaced by the auto-resume step further down.
$startPage = 1;
$endPage   = 16000;

// Number of listing pages fetched concurrently in one multi-curl batch.
$batchSize = 5;

// Listing URL prefix; the page number is appended to it.
$baseUrl =
    "https://simas.kemenag.go.id/page/profilmasjid/0/0/0/0/0?page=";

// SQLite database file, created next to this script.
$dbFile = __DIR__ . '/simas.db';

// Numbers of pages that could not be scraped are appended here, one per line.
$failedLog = __DIR__ . '/failed.txt';

// =========================================================
// SQLITE
// =========================================================

$db = new PDO("sqlite:" . $dbFile);

// Make every database error throw instead of failing silently.
$db->setAttribute(
    PDO::ATTR_ERRMODE,
    PDO::ERRMODE_EXCEPTION
);

// =========================================================
// SQLITE OPTIMIZATION
// =========================================================

// Connection tuning: WAL journal, NORMAL synchronous level, enlarged page
// cache, in-memory temp storage and a busy timeout of 30000.

$db->exec("
    PRAGMA journal_mode = WAL;
");

$db->exec("
    PRAGMA synchronous = NORMAL;
");

$db->exec("
    PRAGMA cache_size = 100000;
");

$db->exec("
    PRAGMA temp_store = MEMORY;
");

$db->exec("
    PRAGMA busy_timeout = 30000;
");

// =========================================================
// TABLE
// =========================================================

// One row per listing entry. link_detail is UNIQUE, which is what lets the
// INSERT OR IGNORE statement below skip entries that are already stored.
$db->exec("
CREATE TABLE IF NOT EXISTS masjid (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    page INTEGER,

    nama TEXT,

    alamat TEXT,

    link_detail TEXT UNIQUE,

    link_peta TEXT,

    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
");

// =========================================================
// INDEX
// =========================================================

// idx_page backs the MAX(page) lookup used by the auto-resume step.
$db->exec("
CREATE INDEX IF NOT EXISTS idx_page
ON masjid(page)
");

$db->exec("
CREATE INDEX IF NOT EXISTS idx_nama
ON masjid(nama)
");

// =========================================================
// PREPARED STATEMENT
// =========================================================

// Prepared once and re-executed for every parsed row in the main loop.
$insertStmt = $db->prepare("
INSERT OR IGNORE INTO masjid
(
    page,
    nama,
    alamat,
    link_detail,
    link_peta
)
VALUES
(
    :page,
    :nama,
    :alamat,
    :link_detail,
    :link_peta
)
");

// =========================================================
// AUTO RESUME
// =========================================================

// Continue after the highest page number already stored. On an empty table
// MAX(page) yields no value, so the configured $startPage is kept.
$lastPage = $db->query("
    SELECT MAX(page) as last_page
    FROM masjid
")->fetch(PDO::FETCH_ASSOC);

if (!empty($lastPage['last_page'])) {

    $startPage = ((int)$lastPage['last_page']) + 1;

    echo "RESUME FROM PAGE : {$startPage}\n";
}

// =========================================================
// USER AGENT
// =========================================================

// Pool of browser User-Agent strings; one is picked at random per request.
$userAgents = [

    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/136.0 Safari/537.36',

    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/135.0 Safari/537.36',

    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/134.0 Safari/537.36',

];

// =========================================================
// CLEAN TEXT
// =========================================================

/**
 * Normalise a scraped text fragment before it is stored.
 *
 * Steps: decode HTML entities, drop byte sequences that are not well-formed
 * UTF-8, replace control characters with spaces, collapse whitespace runs
 * into a single space, and trim.
 *
 * @param string|null $text Raw text; null is treated as an empty string.
 *
 * @return string Cleaned text (possibly empty).
 */
function cleanText($text)
{
    if ($text === null) {
        return '';
    }

    $text = html_entity_decode(
        (string) $text,
        ENT_QUOTES | ENT_HTML5,
        'UTF-8'
    );

    // The /u patterns below return null for malformed UTF-8, which used to
    // turn the whole value into ''. Keep only the well-formed sequences so a
    // single bad byte no longer wipes out the field.
    if (preg_match('//u', $text) !== 1) {

        preg_match_all(
            '/[\x00-\x7F]'
            . '|[\xC2-\xDF][\x80-\xBF]'
            . '|\xE0[\xA0-\xBF][\x80-\xBF]'
            . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
            . '|\xED[\x80-\x9F][\x80-\xBF]'
            . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
            . '|[\xF1-\xF3][\x80-\xBF]{3}'
            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}/',
            $text,
            $wellFormed
        );

        $text = implode('', $wellFormed[0]);
    }

    // Control characters become spaces so words on separate lines stay apart.
    $text = preg_replace(
        '/[\x00-\x1F\x7F]/u',
        ' ',
        $text
    );

    $text = preg_replace(
        '/\s+/u',
        ' ',
        (string) $text
    );

    return trim((string) $text);
}

// =========================================================
// CURL HANDLE
// =========================================================

/**
 * Build a configured cURL easy handle for a single GET request.
 *
 * The handle returns the body as a string, follows redirects, accepts gzip,
 * uses a 20 s connect timeout and a 60 s total timeout, and sends headers
 * that ask for fresh HTML.
 *
 * @param string $url        Absolute URL to fetch.
 * @param string $userAgent  Value for the User-Agent header.
 * @param bool   $verifyPeer Whether to verify the server's TLS certificate.
 *                           Defaults to false to keep the previous behaviour.
 *
 * @return resource|\CurlHandle The handle, ready to be added to a multi handle.
 */
function createCurlHandle($url, $userAgent, $verifyPeer = false)
{
    $ch = curl_init();

    curl_setopt_array($ch, [

        CURLOPT_URL => $url,

        CURLOPT_RETURNTRANSFER => true,

        CURLOPT_FOLLOWLOCATION => true,

        // NOTE(review): with verification off, the scraped data can be
        // tampered with in transit. Pass $verifyPeer = true wherever the
        // server's certificate chain validates.
        CURLOPT_SSL_VERIFYPEER => (bool) $verifyPeer,

        CURLOPT_CONNECTTIMEOUT => 20,

        CURLOPT_TIMEOUT => 60,

        CURLOPT_ENCODING => 'gzip',

        CURLOPT_USERAGENT => $userAgent,

        CURLOPT_HTTPHEADER => [

            'Accept: text/html,application/xhtml+xml',

            'Cache-Control: no-cache',

            'Pragma: no-cache',

        ]

    ]);

    return $ch;
}

// =========================================================
// PARSER
// =========================================================

/**
 * Extract the mosque entries from one listing page.
 *
 * Every element whose class contains "search-result-item" yields one row with
 * the name (first h4), the address (first paragraph of the
 * "search-result-content" div), the detail link and the map link. Entries
 * without name, address and detail link are skipped.
 *
 * @param string|null|false $html Raw HTML of the listing page.
 * @param int               $page Page number stored with every row.
 *
 * @return array[] Rows with the keys page, nama, alamat, link_detail and
 *                 link_peta; an empty array when nothing could be parsed.
 */
function parsePage($html, $page)
{
    // A failed transfer can hand over null/false/'' here, and
    // DOMDocument::loadHTML() refuses an empty string (ValueError on PHP 8).
    if (!is_string($html) || trim($html) === '') {
        return [];
    }

    libxml_use_internal_errors(true);

    $dom = new DOMDocument();

    // Without a charset declaration loadHTML() assumes ISO-8859-1 and turns
    // UTF-8 text into mojibake; the XML encoding hint forces UTF-8.
    $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html);

    // Internal errors are buffered by libxml until cleared. Over thousands of
    // pages of imperfect markup that buffer would keep growing.
    libxml_clear_errors();

    if (!$loaded) {
        return [];
    }

    $xpath = new DOMXPath($dom);

    // Returns the first node matching $query below $context, or null.
    $first = function ($query, $context) use ($xpath) {

        $nodes = $xpath->query($query, $context);

        if ($nodes === false || $nodes->length === 0) {
            return null;
        }

        return $nodes->item(0);
    };

    $items = $xpath->query(
        "//div[contains(@class,'search-result-item')]"
    );

    $rows = [];

    foreach ($items as $item) {

        $namaNode = $first(".//h4", $item);

        $alamatNode = $first(
            ".//div[contains(@class,'search-result-content')]/p[1]",
            $item
        );

        $detailNode = $first(
            ".//a[contains(text(),'Lihat Detil')]",
            $item
        );

        $mapNode = $first(
            ".//a[contains(text(),'Lihat di Peta')]",
            $item
        );

        $nama = $namaNode ? trim($namaNode->textContent) : '';

        $alamat = $alamatNode ? trim($alamatNode->textContent) : '';

        $linkDetail = $detailNode
            ? trim($detailNode->getAttribute('href'))
            : '';

        $linkPeta = $mapNode
            ? trim($mapNode->getAttribute('href'))
            : '';

        // Skip decorative or empty containers that carry no usable data.
        if (
            empty($nama) &&
            empty($alamat) &&
            empty($linkDetail)
        ) {
            continue;
        }

        $rows[] = [

            'page' => $page,

            'nama' => cleanText($nama),

            'alamat' => cleanText($alamat),

            'link_detail' => cleanText($linkDetail),

            'link_peta' => cleanText($linkPeta)

        ];
    }

    return $rows;
}

// =========================================================
// MAIN LOOP
// =========================================================

// Run counters shown after every batch and in the final summary.
$totalData = 0;
$totalSuccess = 0;
$totalFailed = 0;

// Pages requested since the last long pause. The previous check
// ($i % 100 == 0) could never be true when $i runs 1, 6, 11, ... so the long
// cool-down never happened.
$pagesSinceCoolDown = 0;

$startTime = time();

// Record a page that could not be scraped and bump the failure counter.
$markFailed = function ($page) use ($failedLog, &$totalFailed) {

    file_put_contents(
        $failedLog,
        $page . PHP_EOL,
        FILE_APPEND
    );

    $totalFailed++;
};

for ($i = $startPage; $i <= $endPage; $i += $batchSize) {

    echo "\n==================================================\n";
    echo "BATCH : {$i}\n";
    echo "TIME  : " . date('Y-m-d H:i:s') . "\n";

    $multiHandle = curl_multi_init();

    $handles = [];

    // =====================================================
    // CREATE MULTICURL
    // =====================================================

    // The last batch may be shorter than $batchSize.
    $lastInBatch = min($i + $batchSize - 1, $endPage);

    for ($page = $i; $page <= $lastInBatch; $page++) {

        $url = $baseUrl . $page;

        echo "QUEUE PAGE {$page}\n";

        $ch = createCurlHandle(
            $url,
            $userAgents[array_rand($userAgents)]
        );

        $handles[$page] = [
            'handle' => $ch,
            'url' => $url,
            'result' => null
        ];

        curl_multi_add_handle(
            $multiHandle,
            $ch
        );

        usleep(rand(100000, 300000));
    }

    // =====================================================
    // EXECUTE MULTICURL
    // =====================================================

    $running = null;

    do {

        curl_multi_exec(
            $multiHandle,
            $running
        );

        // select() can report -1 immediately; sleep briefly in that case so
        // the loop does not spin at full CPU.
        if (
            $running > 0 &&
            curl_multi_select($multiHandle, 1) === -1
        ) {
            usleep(100000);
        }

    } while ($running > 0);

    // Per-transfer result codes are only available through
    // curl_multi_info_read(); collect them so transport errors are reported
    // as such instead of surfacing as "HTTP 0".
    while (($message = curl_multi_info_read($multiHandle)) !== false) {

        foreach ($handles as $queuedPage => $queued) {

            if ($queued['handle'] === $message['handle']) {

                $handles[$queuedPage]['result'] = $message['result'];

                break;
            }
        }
    }

    // =====================================================
    // HANDLE RESULT
    // =====================================================

    foreach ($handles as $page => $info) {

        $ch = $info['handle'];

        $html = curl_multi_getcontent($ch);

        $httpCode = curl_getinfo(
            $ch,
            CURLINFO_HTTP_CODE
        );

        $error = curl_error($ch);

        if (
            $error === '' &&
            $info['result'] !== null &&
            $info['result'] !== CURLE_OK
        ) {
            $error = (string) curl_strerror($info['result']);
        }

        curl_multi_remove_handle(
            $multiHandle,
            $ch
        );

        curl_close($ch);

        // =================================================
        // CURL ERROR
        // =================================================

        if ($error !== '') {

            echo "ERROR PAGE {$page} : {$error}\n";

            $markFailed($page);

            continue;
        }

        // =================================================
        // HTTP ERROR
        // =================================================

        if ($httpCode != 200) {

            echo "HTTP {$httpCode} PAGE {$page}\n";

            $markFailed($page);

            continue;
        }

        // =================================================
        // PARSE HTML
        // =================================================

        $rows = parsePage($html, $page);

        if (count($rows) == 0) {

            echo "EMPTY PAGE {$page}\n";

            $markFailed($page);

            continue;
        }

        // =================================================
        // INSERT SQLITE
        // =================================================

        // One transaction per page keeps a page's rows all-or-nothing.
        $db->beginTransaction();

        foreach ($rows as $row) {

            $insertStmt->execute([

                ':page' => $row['page'],

                ':nama' => $row['nama'],

                ':alamat' => $row['alamat'],

                ':link_detail' => $row['link_detail'],

                ':link_peta' => $row['link_peta']

            ]);

            $totalData++;
        }

        $db->commit();

        echo "SUCCESS PAGE {$page} => "
            . count($rows)
            . " DATA\n";

        $totalSuccess++;

        unset($rows);
        unset($html);
    }

    curl_multi_close($multiHandle);

    // =====================================================
    // COOL DOWN
    // =====================================================

    // Long pause after every 100 requested pages, short random pause otherwise.
    $pagesSinceCoolDown += count($handles);

    if ($pagesSinceCoolDown >= 100) {

        $pagesSinceCoolDown = 0;

        echo "\nCOOL DOWN 15 DETIK...\n";

        sleep(15);

    } else {

        usleep(rand(700000, 2000000));
    }

    gc_collect_cycles();

    // =====================================================
    // STATS
    // =====================================================

    $elapsed = time() - $startTime;

    echo "\nTOTAL DATA   : {$totalData}\n";
    echo "SUCCESS PAGE : {$totalSuccess}\n";
    echo "FAILED PAGE  : {$totalFailed}\n";
    echo "ELAPSED      : {$elapsed} detik\n";
}

// =========================================================
// FINISH
// =========================================================

// Dropping the last reference closes the SQLite connection.
$db = null;

$elapsed = time() - $startTime;

echo "\n==================================================\n";
echo "SCRAPING SELESAI\n";
echo "TOTAL DATA   : {$totalData}\n";
echo "SUCCESS PAGE : {$totalSuccess}\n";
echo "FAILED PAGE  : {$totalFailed}\n";
echo "TOTAL TIME   : {$elapsed} detik\n";
echo "DATABASE     : {$dbFile}\n";
echo "FAILED LOG   : {$failedLog}\n";
echo "==================================================\n";

Enter fullscreen mode Exit fullscreen mode

Langkah 2 — ambil detail tiap masjid, lalu update ke tabel masjid yang sama:


<?php

/**
 * =========================================================
 * SIMAS DETAIL SCRAPER
 * SQLITE + AUTO RESUME + MULTICURL
 * =========================================================
 *
 * REQUIREMENT:
 * composer require symfony/dom-crawler symfony/css-selector
 *
 * FITUR:
 * - ambil semua link dari sqlite
 * - scrape detail otomatis
 * - auto resume
 * - multicurl batch
 * - anti duplicate
 * - aman karakter aneh
 * - prepared statement
 * - rollback transaction
 * - restart tinggal jalankan ulang
 *
 * =========================================================
 */

// Long-running CLI job: no execution-time or memory ceiling.
set_time_limit(0);
ini_set('memory_limit', '-1');

require 'vendor/autoload.php';

use Symfony\Component\DomCrawler\Crawler;

// =========================================================
// SQLITE
// =========================================================

$db = new PDO('sqlite:' . __DIR__ . '/simas.db');
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

// =========================================================
// SQLITE OPTIMIZATION
// =========================================================

$pragmas = [
    "PRAGMA journal_mode=WAL",
    "PRAGMA synchronous=NORMAL",
    "PRAGMA cache_size=100000",
    "PRAGMA temp_store=MEMORY",
    "PRAGMA busy_timeout=30000",
];

foreach ($pragmas as $pragma) {
    $db->exec($pragma);
}

// =========================================================
// ADD COLUMNS THAT DO NOT EXIST YET
// =========================================================

// Column definitions required by the detail scraper. The first word of each
// entry is the column name, the remainder its SQLite type/default.
$columns = [
    // progress flag: 0 = detail page not scraped yet
    'detail_scraped INTEGER DEFAULT 0',

    // identity
    'jenis TEXT',
    'no_id_masjid_mushalla TEXT',
    'didirikan_tahun TEXT',

    // address lines
    'alamat_baris_1 TEXT',
    'alamat_baris_2 TEXT',
    'alamat_baris_3 TEXT',

    // contact
    'alamat_email TEXT',
    'alamat_web TEXT',

    // summary counters
    'jumlah_pengurus TEXT',
    'jumlah_imam TEXT',
    'jumlah_khatib TEXT',
    'jumlah_muazin TEXT',
    'jumlah_remaja_masjid TEXT',

    // profile
    'luas_tanah TEXT',
    'status_tanah TEXT',
    'luas_bangunan TEXT',
    'daya_tampung_jamaah TEXT',

    // free text and photos
    'sejarah_masjid TEXT',
    'foto_masjid TEXT'
];

$tableInfo = $db->query("
    PRAGMA table_info(masjid)
")->fetchAll(PDO::FETCH_ASSOC);

// Names of the columns the table already has.
$existingColumns = array_column($tableInfo, 'name');

foreach ($columns as $definition) {

    $name = explode(' ', trim($definition))[0];

    if (in_array($name, $existingColumns)) {
        continue;
    }

    // The statement text is kept exactly as before, whitespace included.
    $db->exec("
            ALTER TABLE masjid
            ADD COLUMN {$definition}
        ");

    echo "ADD COLUMN : {$name}\n";
}

// =========================================================
// CONFIG
// =========================================================

// Number of detail pages fetched concurrently per batch.
$batchSize = 5;

// Pool of browser User-Agent strings; one is picked at random per request.
$userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/137 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/136 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/537.36 Chrome/135 Safari/537.36',
];

// =========================================================
// CLEAN TEXT
// =========================================================

/**
 * Normalise a scraped fragment into plain single-line text.
 *
 * Steps: decode HTML entities, drop byte sequences that are not well-formed
 * UTF-8, strip markup tags, replace control characters with spaces, collapse
 * whitespace runs into a single space, and trim.
 *
 * @param string|null $text Raw text or HTML; null is treated as ''.
 *
 * @return string Cleaned text (possibly empty).
 */
function cleanText(?string $text): string
{
    $text = html_entity_decode(
        $text ?? '',
        ENT_QUOTES | ENT_HTML5,
        'UTF-8'
    );

    // The /u patterns below return null for malformed UTF-8, which used to
    // turn the whole value into ''. Keep only the well-formed sequences so a
    // single bad byte no longer wipes out the field.
    if (preg_match('//u', $text) !== 1) {

        preg_match_all(
            '/[\x00-\x7F]'
            . '|[\xC2-\xDF][\x80-\xBF]'
            . '|\xE0[\xA0-\xBF][\x80-\xBF]'
            . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
            . '|\xED[\x80-\x9F][\x80-\xBF]'
            . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
            . '|[\xF1-\xF3][\x80-\xBF]{3}'
            . '|\xF4[\x80-\x8F][\x80-\xBF]{2}/',
            $text,
            $wellFormed
        );

        $text = implode('', $wellFormed[0]);
    }

    $text = strip_tags($text);

    // Control characters become spaces so words on separate lines stay apart.
    $text = preg_replace(
        '/[\x00-\x1F\x7F]/u',
        ' ',
        $text
    );

    $text = preg_replace('/\s+/u', ' ', (string) $text);

    return trim((string) $text);
}

// =========================================================
// GET TEXT
// =========================================================

/**
 * Return the cleaned text of the $index-th node matching a CSS selector.
 *
 * Any failure while selecting or reading the node (for example when nothing
 * matches) results in an empty string instead of an exception.
 *
 * @param Crawler $crawler  Crawler to search in.
 * @param string  $selector CSS selector.
 * @param int     $index    Zero-based position among the matches.
 *
 * @return string Cleaned node text, or '' on any failure.
 */
function getNodeText(
    Crawler $crawler,
    string $selector,
    int $index = 0
): string {

    $result = '';

    try {

        $matches = $crawler->filter($selector);

        $target = $matches->eq($index);

        $result = cleanText($target->text());

    } catch (\Throwable $ignored) {

        // Best effort: a missing node is reported as empty text.
        $result = '';
    }

    return $result;
}

// =========================================================
// GET PROFIL VALUE
// =========================================================

/**
 * Look up a value in the profile section by its label.
 *
 * Each ".section-content-info-wrapper .row" is expected to hold at least two
 * ".label" elements: the caption and the value. The caption is compared
 * case-insensitively as a substring match; when several rows match, the last
 * one wins. Any error while walking the rows ends the search and keeps the
 * value found so far.
 *
 * @param Crawler $crawler   Crawler positioned on the detail page.
 * @param string  $labelCari Label text to search for.
 *
 * @return string The matching value, or '' when none was found.
 */
function getProfilValue(
    Crawler $crawler,
    string $labelCari
): string {

    $found = '';

    // Inspects one row and stores its value when the caption matches.
    $inspectRow = function ($row) use ($labelCari, &$found) {

        $cells = $row->filter('.label');

        if ($cells->count() < 2) {
            return;
        }

        $caption = cleanText($cells->eq(0)->text());

        $content = cleanText($cells->eq(1)->text());

        if (stripos($caption, $labelCari) === false) {
            return;
        }

        $found = $content;
    };

    try {

        $rowsInProfile = $crawler->filter(
            '.section-content-info-wrapper .row'
        );

        $rowsInProfile->each($inspectRow);

    } catch (\Throwable $ignored) {
        // Deliberately best effort: keep whatever was matched so far.
    }

    return $found;
}

// =========================================================
// PARSE DETAIL
// =========================================================

/**
 * Extract all profile fields from the HTML of one mosque detail page.
 *
 * Every section is parsed on a best-effort basis: a section that is missing
 * or malformed yields an empty value and never prevents the other sections
 * (or the other items of the same section) from being read.
 *
 * @param string $html Raw HTML of the detail page.
 *
 * @return array<string, string> Values keyed by the column names of the
 *                               masjid table (nama, jenis, alamat, ...).
 */
function parseMasjidDetail($html)
{
    $crawler = new Crawler($html);

    // =====================================================
    // BASIC
    // =====================================================

    $jenis = getNodeText($crawler, '.masjid-sub-title');

    $nama = getNodeText($crawler, '.masjid-title');

    $noId = getNodeText($crawler, '.masjid-card a');

    // =====================================================
    // FOUNDING YEAR
    // =====================================================

    // The first 4-digit number of a calendar paragraph is taken as the year;
    // when several paragraphs contain one, the last paragraph wins.
    $didirikan = '';

    try {

        $crawler
            ->filter('.masjid-alamat-calendar p')
            ->each(function ($node) use (&$didirikan) {

                $text = cleanText($node->text());

                if (preg_match('/(\d{4})/', $text, $match)) {
                    $didirikan = $match[1];
                }
            });

    } catch (\Throwable $e) {
        // Best effort: leave the year empty.
    }

    // =====================================================
    // ADDRESS
    // =====================================================

    $alamat = '';
    $alamat1 = '';
    $alamat2 = '';
    $alamat3 = '';

    try {

        $alamatNode = $crawler
            ->filter('.masjid-alamat-location p')
            ->first();

        if ($alamatNode->count()) {

            // The address lines are separated by <br> inside one paragraph.
            $lines = preg_split(
                '/<br\s*\/?>/i',
                $alamatNode->html()
            );

            $lines = array_map(function ($line) {
                return cleanText($line);
            }, $lines);

            // Compare against '' explicitly: a bare array_filter() would
            // also drop a line that consists of "0".
            $lines = array_values(
                array_filter($lines, function ($line) {
                    return $line !== '';
                })
            );

            $alamat1 = $lines[0] ?? '';
            $alamat2 = $lines[1] ?? '';
            $alamat3 = $lines[2] ?? '';

            $alamat = implode(', ', $lines);
        }

    } catch (\Throwable $e) {
        // Best effort: leave the address empty.
    }

    // =====================================================
    // EMAIL & WEBSITE
    // =====================================================

    $email = '';
    $website = '';

    try {

        $crawler
            ->filter('.masjid-alamat-phone')
            ->each(function ($node) use (&$email, &$website) {

                // Handled per entry, so one incomplete entry (no icon or no
                // paragraph) no longer aborts the remaining entries.
                try {

                    // attr() may return null; cast before string functions.
                    $icon = (string) $node
                        ->filter('i')
                        ->attr('class');

                    $isEmail = strpos($icon, 'ti-email') !== false;

                    $isWebsite = strpos($icon, 'ti-desktop') !== false;

                    if (!$isEmail && !$isWebsite) {
                        return;
                    }

                    $value = cleanText(
                        $node
                            ->filter('p')
                            ->text()
                    );

                } catch (\Throwable $e) {
                    return;
                }

                if ($isEmail) {
                    $email = $value;
                }

                if ($isWebsite) {
                    $website = $value;
                }
            });

    } catch (\Throwable $e) {
        // Best effort: leave the contact data empty.
    }

    // =====================================================
    // SUMMARY
    // =====================================================

    // Maps the caption of each summary tile (h4) to its value (span).
    $summary = [];

    try {

        $crawler
            ->filter('.summary-item')
            ->each(function ($node) use (&$summary) {

                // Per-item handling: skip a malformed tile, keep the others.
                try {

                    $judul = cleanText(
                        $node->filter('h4')->text()
                    );

                    $nilai = cleanText(
                        $node->filter('span')->text()
                    );

                } catch (\Throwable $e) {
                    return;
                }

                $summary[$judul] = $nilai;
            });

    } catch (\Throwable $e) {
        // Best effort: leave the summary empty.
    }

    // =====================================================
    // PROFILE
    // =====================================================

    $luasTanah = getProfilValue($crawler, 'Luas Tanah');

    $statusTanah = getProfilValue($crawler, 'Status Tanah');

    $luasBangunan = getProfilValue($crawler, 'Luas Bangunan');

    $dayaTampung = getProfilValue($crawler, 'Daya Tampung Jamaah');

    // =====================================================
    // HISTORY
    // =====================================================

    $sejarah = '';

    try {

        $crawler
            ->filter('#content-sejarah')
            ->each(function ($node) use (&$sejarah) {

                $sejarah .= ' ' . cleanText($node->text());
            });

    } catch (\Throwable $e) {
        // Best effort: keep what was collected so far.
    }

    $sejarah = trim($sejarah);

    // =====================================================
    // PHOTOS
    // =====================================================

    $fotos = [];

    try {

        $crawler
            ->filter('.profil-masjid-photos a')
            ->each(function ($node) use (&$fotos) {

                // attr() returns null when the anchor has no href.
                $href = trim((string) $node->attr('href'));

                if ($href !== '') {
                    $fotos[] = $href;
                }
            });

    } catch (\Throwable $e) {
        // Best effort: keep the photos collected so far.
    }

    $fotos = array_unique($fotos);

    return [

        'nama' => $nama,

        'jenis' => $jenis,

        'no_id_masjid_mushalla' => $noId,

        'didirikan_tahun' => $didirikan,

        'alamat' => $alamat,

        'alamat_baris_1' => $alamat1,
        'alamat_baris_2' => $alamat2,
        'alamat_baris_3' => $alamat3,

        'alamat_email' => $email,
        'alamat_web' => $website,

        'jumlah_pengurus' =>
            $summary['Jumlah Pengurus'] ?? '',

        'jumlah_imam' =>
            $summary['Jumlah Imam'] ?? '',

        'jumlah_khatib' =>
            $summary['Jumlah Khatib'] ?? '',

        'jumlah_muazin' =>
            $summary['Jumlah Muazin'] ?? '',

        'jumlah_remaja_masjid' =>
            $summary['Jumlah Remaja Masjid'] ?? '',

        'luas_tanah' => $luasTanah,
        'status_tanah' => $statusTanah,
        'luas_bangunan' => $luasBangunan,
        'daya_tampung_jamaah' => $dayaTampung,

        'sejarah_masjid' => $sejarah,

        // Photo URLs are stored as one " | "-separated string.
        'foto_masjid' => implode(
            ' | ',
            $fotos
        )
    ];
}

// =========================================================
// UPDATE QUERY
// =========================================================

// Writes every parsed field back to the row and flags it as scraped.
$updateStmt = $db->prepare("
UPDATE masjid SET

    nama = :nama,

    jenis = :jenis,

    no_id_masjid_mushalla = :no_id_masjid_mushalla,

    didirikan_tahun = :didirikan_tahun,

    alamat = :alamat,

    alamat_baris_1 = :alamat_baris_1,
    alamat_baris_2 = :alamat_baris_2,
    alamat_baris_3 = :alamat_baris_3,

    alamat_email = :alamat_email,
    alamat_web = :alamat_web,

    jumlah_pengurus = :jumlah_pengurus,
    jumlah_imam = :jumlah_imam,
    jumlah_khatib = :jumlah_khatib,
    jumlah_muazin = :jumlah_muazin,
    jumlah_remaja_masjid = :jumlah_remaja_masjid,

    luas_tanah = :luas_tanah,
    status_tanah = :status_tanah,
    luas_bangunan = :luas_bangunan,
    daya_tampung_jamaah = :daya_tampung_jamaah,

    sejarah_masjid = :sejarah_masjid,

    foto_masjid = :foto_masjid,

    detail_scraped = 1

WHERE id = :id
");

// =========================================================
// TOTAL
// =========================================================

$total = $db->query("
    SELECT COUNT(*)
    FROM masjid
    WHERE detail_scraped = 0
")->fetchColumn();

echo "TOTAL BELUM SCRAPE : {$total}\n";

// =========================================================
// MAIN LOOP
// =========================================================

// Pending rows are walked with an id cursor. Selecting only on
// detail_scraped = 0 would return the same rows again whenever they fail,
// and a handful of permanently failing URLs would loop forever. With the
// cursor each pending row is attempted at most once per run; failed rows
// stay at detail_scraped = 0 and are retried on the next run.
$selectStmt = $db->prepare("
    SELECT id, link_detail
    FROM masjid
    WHERE detail_scraped = 0
      AND id > :last_id
    ORDER BY id
    LIMIT :limit
");

$lastId = 0;

$failedCount = 0;

while (true) {

    $selectStmt->bindValue(':last_id', $lastId, PDO::PARAM_INT);
    $selectStmt->bindValue(':limit', $batchSize, PDO::PARAM_INT);

    $selectStmt->execute();

    $rows = $selectStmt->fetchAll(PDO::FETCH_ASSOC);

    if (!$rows) {
        break;
    }

    // Advance the cursor past this batch whatever its outcome.
    $lastId = (int) end($rows)['id'];

    echo "\n===================================\n";
    echo "NEW BATCH\n";

    $mh = curl_multi_init();

    $handles = [];

    // =====================================================
    // QUEUE CURL
    // =====================================================

    foreach ($rows as $row) {

        $link = trim((string) $row['link_detail']);

        // The listing scraper can store entries without a detail link;
        // there is nothing to request for those.
        if ($link === '') {

            echo "SKIP (NO LINK) : ID {$row['id']}\n";

            $failedCount++;

            continue;
        }

        echo "QUEUE : {$link}\n";

        $ch = curl_init();

        curl_setopt_array($ch, [

            CURLOPT_URL => $link,

            CURLOPT_RETURNTRANSFER => true,

            CURLOPT_FOLLOWLOCATION => true,

            // NOTE(review): TLS verification is disabled, so responses can
            // be tampered with in transit. Enable it if the server's
            // certificate chain validates.
            CURLOPT_SSL_VERIFYPEER => false,

            CURLOPT_TIMEOUT => 30,

            CURLOPT_ENCODING => 'gzip',

            CURLOPT_USERAGENT =>
                $userAgents[array_rand($userAgents)]

        ]);

        curl_multi_add_handle($mh, $ch);

        $handles[] = [
            'id' => $row['id'],
            'url' => $link,
            'handle' => $ch,
            'result' => null
        ];

        usleep(rand(100000, 300000));
    }

    // =====================================================
    // EXECUTE
    // =====================================================

    $running = null;

    do {

        curl_multi_exec($mh, $running);

        // select() can report -1 immediately; sleep briefly in that case so
        // the loop does not spin at full CPU.
        if ($running > 0 && curl_multi_select($mh) === -1) {
            usleep(100000);
        }

    } while ($running > 0);

    // Per-transfer result codes are only available through
    // curl_multi_info_read(); collect them so transport errors are reported.
    while (($message = curl_multi_info_read($mh)) !== false) {

        foreach ($handles as $key => $queued) {

            if ($queued['handle'] === $message['handle']) {

                $handles[$key]['result'] = $message['result'];

                break;
            }
        }
    }

    // =====================================================
    // RESULT
    // =====================================================

    foreach ($handles as $item) {

        $id = $item['id'];

        $url = $item['url'];

        $ch = $item['handle'];

        $html = curl_multi_getcontent($ch);

        $http = curl_getinfo(
            $ch,
            CURLINFO_HTTP_CODE
        );

        $error = curl_error($ch);

        if (
            $error === '' &&
            $item['result'] !== null &&
            $item['result'] !== CURLE_OK
        ) {
            $error = (string) curl_strerror($item['result']);
        }

        curl_multi_remove_handle($mh, $ch);

        curl_close($ch);

        // =================================================
        // CURL ERROR
        // =================================================

        if ($error !== '') {

            echo "CURL ERROR : {$url}\n";
            echo $error . "\n";

            $failedCount++;

            continue;
        }

        // =================================================
        // HTTP ERROR
        // =================================================

        if ($http != 200) {

            echo "HTTP {$http} : {$url}\n";

            $failedCount++;

            continue;
        }

        // =================================================
        // PARSE + SAVE
        // =================================================

        try {

            $detail = parseMasjidDetail((string) $html);

            // A page without a parsable name is treated as an error or
            // placeholder page. Saving it would overwrite the name and
            // address from the listing with blanks.
            if ($detail['nama'] === '') {
                throw new \RuntimeException('detail page has no name');
            }

            $detail['id'] = $id;

            $db->beginTransaction();

            $updateStmt->execute($detail);

            $db->commit();

            echo "SUCCESS : {$url}\n";

        } catch (\Throwable $e) {

            if ($db->inTransaction()) {
                $db->rollBack();
            }

            $failedCount++;

            echo "FAILED : {$url}\n";
            echo $e->getMessage() . "\n";
        }

        unset($html);
        unset($detail);

        gc_collect_cycles();
    }

    curl_multi_close($mh);

    echo "COOLDOWN...\n";

    usleep(rand(1000000, 3000000));
}

// =========================================================
// FINISH
// =========================================================

echo "\n===================================\n";
echo "SELESAI\n";
echo "FAILED THIS RUN : {$failedCount}\n";
echo "===================================\n";

Enter fullscreen mode Exit fullscreen mode

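Cara menjalankannya: simpan script sebagai scrape.php, lalu jalankan perintah berikut di terminal (di Windows CMD cukup `php scrape.php`, karena `nohup` tidak tersedia):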

nohup php scrape.php
Enter fullscreen mode Exit fullscreen mode

Top comments (0)