DEV Community

IronSoftware
IronSoftware

Posted on • Originally published at ironsoftware.com

C# Web Scraper

Iron WebScraper provides a powerful framework to extract data and files from websites using C# code.

  1. Install IronWebScraper into your project using NuGet
  2. Create a Class Extending WebScraper
  3. Create an Init method that uses the Request method to parse at least one URL.
  4. Create a Parse method to process the responses and, where needed, Request more pages. Use response.Css to work with HTML elements using jQuery-style CSS selectors.
  5. In your application, create an instance of your web scraping class and call its Start() method
  6. Read our C# webscraping tutorials to learn how to create advanced web crawlers using IronWebScraper

C#:

using IronWebScraper;

namespace WebScrapingProject
{
    class MainClass
    {
        /// <summary>
        /// Program entry point: constructs a <see cref="BlogScraper"/> and calls Start on it.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            new BlogScraper().Start();
        }
    }

    class BlogScraper : WebScraper
    {
        /// <summary>
        /// Sets the logging level to <c>LogLevel.All</c> and requests
        /// https://blog.scrapinghub.com with <see cref="Parse"/> as the callback.
        /// </summary>
        public override void Init()
        {
            LoggingLevel = WebScraper.LogLevel.All;
            Request("https://blog.scrapinghub.com", Parse);
        }

        /// <summary>
        /// Passes the cleaned text of every <c>h2.entry-title a</c> match to Scrape
        /// under the key "Title", then, if a <c>div.prev-post &gt; a[href]</c> link
        /// exists, requests its href with this same method as the callback.
        /// </summary>
        /// <param name="response">The response whose HTML is queried.</param>
        public override void Parse(Response response)
        {
            const string titleSelector = "h2.entry-title a";
            const string olderPostsSelector = "div.prev-post > a[href]";

            foreach (var titleLink in response.Css(titleSelector))
            {
                var record = new ScrapedData { { "Title", titleLink.TextContentClean } };
                Scrape(record);
            }

            // No link to an older page of posts: nothing more to request.
            if (!response.CssExists(olderPostsSelector))
            {
                return;
            }

            var olderPostsLinks = response.Css(olderPostsSelector);
            var nextPageUrl = olderPostsLinks[0].Attributes["href"];
            Request(nextPageUrl, Parse);
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

VB:

Imports IronWebScraper

Namespace WebScrapingProject
    Friend Class MainClass
        ''' <summary>
        ''' Program entry point: constructs a BlogScraper and calls Start on it.
        ''' </summary>
        ''' <param name="args">Command-line arguments; not used.</param>
        Public Shared Sub Main(ByVal args() As String)
            Dim crawler As New BlogScraper()
            crawler.Start()
        End Sub
    End Class

    Friend Class BlogScraper
        Inherits WebScraper

        ''' <summary>
        ''' Sets the logging level to LogLevel.All and requests
        ''' https://blog.scrapinghub.com with Parse as the callback.
        ''' </summary>
        Public Overrides Sub Init()
            LoggingLevel = WebScraper.LogLevel.All
            Request("https://blog.scrapinghub.com", AddressOf Parse)
        End Sub

        ''' <summary>
        ''' Passes the cleaned text of every "h2.entry-title a" match to Scrape
        ''' under the key "Title", then, if a "div.prev-post &gt; a[href]" link
        ''' exists, requests its href with this same method as the callback.
        ''' </summary>
        ''' <param name="response">The response whose HTML is queried.</param>
        Public Overrides Sub Parse(ByVal response As Response)
            Const TitleSelector As String = "h2.entry-title a"
            Const OlderPostsSelector As String = "div.prev-post > a[href]"

            For Each titleLink In response.Css(TitleSelector)
                Dim record = New ScrapedData() From {
                    { "Title", titleLink.TextContentClean }
                }
                Scrape(record)
            Next

            ' No link to an older page of posts: nothing more to request.
            If Not response.CssExists(OlderPostsSelector) Then
                Return
            End If

            Dim olderPostsLinks = response.Css(OlderPostsSelector)
            Dim nextPageUrl = olderPostsLinks(0).Attributes("href")
            Request(nextPageUrl, AddressOf Parse)
        End Sub
    End Class
End Namespace
Enter fullscreen mode Exit fullscreen mode

Top comments (0)