DEV Community

artydev
artydev

Posted on

1

Web Scraping With C#

Here is a nice article from Oxylabs: Web Scraping With C#.

You can try this code in VS Code:

using System.Globalization;
using CsvHelper;
using HtmlAgilityPack;

namespace webscraping
{
    /// <summary>
    /// One scraped book. Instances are handed to CsvHelper's <c>WriteRecords</c>,
    /// so each public property ends up as a column in the exported file.
    /// </summary>
    public class Book
    {
        /// <summary>Book title as read from the product page; nullable.</summary>
        public string? Title { get; set; }
        /// <summary>Price kept as the raw page text (not parsed into a number); nullable.</summary>
        public string? Price { get; set; }
    }
    /// <summary>
    /// Console scraper: collects the book links from one catalogue listing page,
    /// visits every book page to read its title and price, and writes the result
    /// to <c>books.csv</c> in the current working directory.
    /// </summary>
    class Program
    {
        /// <summary>Downloads and parses the HTML page at <paramref name="url"/>.</summary>
        /// <param name="url">Absolute URL of the page to load.</param>
        /// <returns>The parsed document.</returns>
        static HtmlDocument GetDocument(string url)
        {
            HtmlWeb web = new HtmlWeb();
            HtmlDocument doc = web.Load(url);
            return doc;
        }

        /// <summary>
        /// Collects the absolute URLs of all <c>h3 &gt; a</c> links on a listing page.
        /// </summary>
        /// <param name="url">Absolute URL of the listing page; also the base for relative links.</param>
        /// <returns>The resolved link URLs; empty when the page contains no matching links.</returns>
        static List<string> GetBookLinks(string url)
        {
            var bookLinks = new List<string>();
            HtmlDocument doc = GetDocument(url);

            // HtmlAgilityPack returns null (not an empty collection) when the XPath
            // matches nothing, so guard before enumerating.
            var linkNodes = doc.DocumentNode.SelectNodes("//h3/a");
            if (linkNodes is null)
            {
                return bookLinks;
            }

            var baseUri = new Uri(url);
            foreach (var link in linkNodes)
            {
                // GetAttributeValue avoids a NullReferenceException on anchors without href.
                string href = link.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrWhiteSpace(href))
                {
                    continue;
                }

                // hrefs on the listing page are relative; resolve them against the page URL.
                bookLinks.Add(new Uri(baseUri, href).AbsoluteUri);
            }
            return bookLinks;
        }

        /// <summary>
        /// Returns the decoded, trimmed text of the first node matching
        /// <paramref name="xpath"/>, or <c>null</c> when no node matches.
        /// </summary>
        static string? GetNodeText(HtmlDocument document, string xpath)
        {
            var node = document.DocumentNode.SelectSingleNode(xpath);
            if (node is null)
            {
                return null;
            }

            // InnerText is the raw markup text; decode entities such as &amp; or &#39;
            // so the CSV contains the characters a reader would see in the browser.
            return HtmlEntity.DeEntitize(node.InnerText).Trim();
        }

        /// <summary>Loads every book page and extracts its title and price.</summary>
        /// <param name="urls">Absolute URLs of the book detail pages.</param>
        /// <returns>
        /// One <see cref="Book"/> per URL, in input order. A field is left <c>null</c>
        /// when the corresponding element is missing from the page.
        /// </returns>
        static List<Book> GetBookDetails(List<string> urls)
        {
            const string titleXPath = "//h1";
            const string priceXPath = "//div[contains(@class,\"product_main\")]/p[@class=\"price_color\"]";

            var books = new List<Book>();
            foreach (var url in urls)
            {
                HtmlDocument document = GetDocument(url);
                var book = new Book
                {
                    Title = GetNodeText(document, titleXPath),
                    Price = GetNodeText(document, priceXPath)
                };
                books.Add(book);
            }
            return books;
        }

        /// <summary>Writes <paramref name="books"/> to <c>books.csv</c>, replacing any existing file.</summary>
        /// <param name="books">Records to export.</param>
        static void exportToCSV(List<Book> books)
        {
            using (var writer = new StreamWriter("books.csv"))
            using (var csv = new CsvWriter(writer, CultureInfo.InvariantCulture))
            {
                csv.WriteRecords(books);
            }
        }

        /// <summary>Entry point: scrape the mystery category listing and export it to CSV.</summary>
        static void Main(string[] args)
        {
            var bookLinks = GetBookLinks("http://books.toscrape.com/catalogue/category/books/mystery_3/index.html");
            Console.WriteLine("Found {0} links", bookLinks.Count);
            var books = GetBookDetails(bookLinks);
            exportToCSV(books);
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

For scraping dynamic websites, you can use Selenium, as shown here:

using System.Globalization;
using CsvHelper;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using WebDriverManager;
using WebDriverManager.DriverConfigs.Impl;

namespace webscraping

{
    /// <summary>A single quotation together with the name of its author.</summary>
    public class Quote
    {
        /// <summary>The quotation text; nullable.</summary>
        public string? Text { get; set; }

        /// <summary>The name of the person quoted; nullable.</summary>
        public string? Author { get; set; }

        /// <summary>
        /// Formats the quote as <c>"{Author} says, {Text}"</c>; a null property
        /// contributes an empty string.
        /// </summary>
        public override string ToString() => $"{Author} says, {Text}";
    }


    /// <summary>
    /// Console scraper for a JavaScript-rendered page: drives headless Firefox through
    /// Selenium, reads every quote on the page, prints it, and exports the list to CSV.
    /// </summary>
    public class Program
    {
        /// <summary>Entry point: set up the driver, scrape the quotes, export them to CSV.</summary>
        static void Main(string[] args)
        {
            // WebDriverManager sets up the Firefox driver binary before the driver is created.
            new DriverManager().SetUpDriver(new FirefoxConfig());

            FirefoxOptions options = new FirefoxOptions();
            options.AddArgument("--headless");

            var driver = new FirefoxDriver(options);
            try
            {
                driver.Navigate().GoToUrl("http://quotes.toscrape.com/js/");

                var quotes = new List<Quote>();

                var quoteContainers = driver.FindElements(By.CssSelector("div.quote"));

                foreach (var item in quoteContainers)
                {
                    Quote quote = new()
                    {
                        Text = item.FindElement(By.CssSelector("span.text")).Text,
                        Author = item.FindElement(By.CssSelector(".author")).Text
                    };
                    quotes.Add(quote);
                    Console.WriteLine(quote.ToString());
                }

                const string csvPath = "c:\\temp\\quotes.csv";

                // StreamWriter does not create missing folders; make sure the target
                // directory exists so the export does not fail on a fresh machine.
                string? csvDirectory = Path.GetDirectoryName(csvPath);
                if (!string.IsNullOrEmpty(csvDirectory))
                {
                    Directory.CreateDirectory(csvDirectory);
                }

                using (var writer = new StreamWriter(csvPath))
                using (var csv = new CsvWriter(writer, CultureInfo.InvariantCulture))
                {
                    try
                    {
                        csv.WriteRecords(quotes);
                        csv.Flush();
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the export failure and carry on to the cleanup below.
                        Console.WriteLine(ex.ToString());
                    }
                }

                // The screenshot is captured but not saved anywhere; persist the
                // returned object if an image file is wanted.
                _ = driver.GetFullPageScreenshot();
            }
            finally
            {
                // Always shut the browser down, even when scraping or exporting throws;
                // otherwise the headless browser process is left running.
                driver.Quit();
            }
        }
    }
}
Enter fullscreen mode Exit fullscreen mode

AWS GenAI LIVE image

How is generative AI increasing efficiency?

Join AWS GenAI LIVE! to find out how gen AI is reshaping productivity, streamlining processes, and driving innovation.

Learn more

Top comments (0)

Billboard image

Create up to 10 Postgres Databases on Neon's free plan.

If you're starting a new project, Neon has got your databases covered. No credit cards. No trials. No getting in your way.

Try Neon for Free →

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay