如何在C#中抓取博客

艾哈邁德·阿布爾馬格德

讓我們使用 Iron WebScraper 來使用 C# 或 VB.NET 提取博客內容。

本教程展示如何使用WordPress博客(或類似)可以使用 .NET 將內容重新抓取回來

FireShotScreenCaptureGizmodo related to 如何在C#中抓取博客

public class BlogScraper : WebScraper
{
    /// <summary>
    /// Override this method initializes your web-scraper.
    /// Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
    /// </summary>
    public override void Init()
    {
        License.LicenseKey = " LicenseKey ";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";
        EnableWebCache(new TimeSpan(1, 30, 30));
        this.Request("http://blogSite.com/", Parse);
    }
}
public class BlogScraper : WebScraper
{
    /// <summary>
    /// Override this method initializes your web-scraper.
    /// Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
    /// </summary>
    public override void Init()
    {
        License.LicenseKey = " LicenseKey ";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";
        EnableWebCache(new TimeSpan(1, 30, 30));
        this.Request("http://blogSite.com/", Parse);
    }
}
Public Class BlogScraper
	Inherits WebScraper

	''' <summary>
	''' Override this method initializes your web-scraper.
	''' Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
	''' </summary>
	Public Overrides Sub Init()
		License.LicenseKey = " LicenseKey "
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\BlogSample\Output\"
		EnableWebCache(New TimeSpan(1, 30, 30))
		Me.Request("http://blogSite.com/", Parse)
	End Sub
End Class
VB   C#

像往常一樣,我們創建一個Scraper並繼承自WebScraper類。 在這種情況下,它是“BlogScraper”。

我们将工作目录设置为“\BlogSample\Output\”,所有输出和缓存文件都可以放在这里。

然後我們啟用 Webcache 將請求的頁面保存在緩存文件夾“WebCache”中。

現在讓我們編寫一個解析函數:

/// <summary>
/// Override this method to create the default Response handler for your web scraper.
/// If you have multiple page types, you can add additional similar methods.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
    foreach (var link in response.Css("div.section-nav > ul > li > a "))
    {
        switch(link.TextContentClean)
        {
            case "Reviews":
                {

                }break;
            case "Science":
                {

                }break;
            default:
                {
                    // Save Result to File
                    Scrape(new ScrapedData() { { "Title", link.TextContentClean } }, "BlogScraper.Jsonl");
                }
                break;
        }
    }
}
/// <summary>
/// Override this method to create the default Response handler for your web scraper.
/// If you have multiple page types, you can add additional similar methods.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
    foreach (var link in response.Css("div.section-nav > ul > li > a "))
    {
        switch(link.TextContentClean)
        {
            case "Reviews":
                {

                }break;
            case "Science":
                {

                }break;
            default:
                {
                    // Save Result to File
                    Scrape(new ScrapedData() { { "Title", link.TextContentClean } }, "BlogScraper.Jsonl");
                }
                break;
        }
    }
}
''' <summary>
''' Override this method to create the default Response handler for your web scraper.
''' If you have multiple page types, you can add additional similar methods.
''' </summary>
''' <param name="response">The http Response object to parse</param>
Public Overrides Sub Parse(ByVal response As Response)
	For Each link In response.Css("div.section-nav > ul > li > a ")
		Select Case link.TextContentClean
			Case "Reviews"

			Case "Science"

			Case Else
					' Save Result to File
					Scrape(New ScrapedData() From {
						{ "Title", link.TextContentClean }
					},
					"BlogScraper.Jsonl")
		End Select
	Next link
End Sub
VB   C#

在 parse 方法內部; 我们解析顶部菜单以获取所有类别页面的所有链接。(電影、科學、評論等。)

然後我們根據連結類別切換到合適的解析方法。

讓我們為科學頁面準備我們的對象模型:

/// <summary>
/// ScienceModel
/// </summary>
public class ScienceModel
{
    /// <summary>
    /// Gets or sets the title.
    /// </summary>
    /// <value>
    /// The title.
    /// </value>
    public string Title { get; set; }
    /// <summary>
    /// Gets or sets the author.
    /// </summary>
    /// <value>
    /// The author.
    /// </value>
    public string Author { get; set; }
    /// <summary>
    /// Gets or sets the date.
    /// </summary>
    /// <value>
    /// The date.
    /// </value>
    public string Date { get; set; }
    /// <summary>
    /// Gets or sets the image.
    /// </summary>
    /// <value>
    /// The image.
    /// </value>
    public string Image { get; set; }
    /// <summary>
    /// Gets or sets the text.
    /// </summary>
    /// <value>
    /// The text.
    /// </value>
    public string Text { get; set; }

}
/// <summary>
/// ScienceModel
/// </summary>
public class ScienceModel
{
    /// <summary>
    /// Gets or sets the title.
    /// </summary>
    /// <value>
    /// The title.
    /// </value>
    public string Title { get; set; }
    /// <summary>
    /// Gets or sets the author.
    /// </summary>
    /// <value>
    /// The author.
    /// </value>
    public string Author { get; set; }
    /// <summary>
    /// Gets or sets the date.
    /// </summary>
    /// <value>
    /// The date.
    /// </value>
    public string Date { get; set; }
    /// <summary>
    /// Gets or sets the image.
    /// </summary>
    /// <value>
    /// The image.
    /// </value>
    public string Image { get; set; }
    /// <summary>
    /// Gets or sets the text.
    /// </summary>
    /// <value>
    /// The text.
    /// </value>
    public string Text { get; set; }

}
''' <summary>
''' ScienceModel
''' </summary>
Public Class ScienceModel
	''' <summary>
	''' Gets or sets the title.
	''' </summary>
	''' <value>
	''' The title.
	''' </value>
	Public Property Title() As String
	''' <summary>
	''' Gets or sets the author.
	''' </summary>
	''' <value>
	''' The author.
	''' </value>
	Public Property Author() As String
	''' <summary>
	''' Gets or sets the date.
	''' </summary>
	''' <value>
	''' The date.
	''' </value>
	Public Property [Date]() As String
	''' <summary>
	''' Gets or sets the image.
	''' </summary>
	''' <value>
	''' The image.
	''' </value>
	Public Property Image() As String
	''' <summary>
	''' Gets or sets the text.
	''' </summary>
	''' <value>
	''' The text.
	''' </value>
	Public Property Text() As String

End Class
VB   C#

現在讓我們實現單頁抓取:

/// <summary>
/// Parses the reviews.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
    // List of Science Link
    var scienceList = new List<ScienceModel>();

    foreach (var postBox in response.Css("section.main > div > div.post-list"))
    {
        var item = new ScienceModel();
        item.Title = postBox.Css("h1.headline > a")[0].TextContentClean;
        item.Author = postBox.Css("div.author > a")[0].TextContentClean;
        item.Date = postBox.Css("div.time > a")[0].TextContentClean;
        item.Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes ["src"];
        item.Text = postBox.Css("div.summary > p")[0].TextContentClean;
        scienceList.Add(item);
    }

    Scrape(scienceList, "BlogScience.Jsonl");
}
/// <summary>
/// Parses the reviews.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
    // List of Science Link
    var scienceList = new List<ScienceModel>();

    foreach (var postBox in response.Css("section.main > div > div.post-list"))
    {
        var item = new ScienceModel();
        item.Title = postBox.Css("h1.headline > a")[0].TextContentClean;
        item.Author = postBox.Css("div.author > a")[0].TextContentClean;
        item.Date = postBox.Css("div.time > a")[0].TextContentClean;
        item.Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes ["src"];
        item.Text = postBox.Css("div.summary > p")[0].TextContentClean;
        scienceList.Add(item);
    }

    Scrape(scienceList, "BlogScience.Jsonl");
}
''' <summary>
''' Parses the reviews.
''' </summary>
''' <param name="response">The response.</param>
Public Sub ParseReviews(ByVal response As Response)
	' List of Science Link
	Dim scienceList = New List(Of ScienceModel)()

	For Each postBox In response.Css("section.main > div > div.post-list")
		Dim item = New ScienceModel()
		item.Title = postBox.Css("h1.headline > a")(0).TextContentClean
		item.Author = postBox.Css("div.author > a")(0).TextContentClean
		item.Date = postBox.Css("div.time > a")(0).TextContentClean
		item.Image = postBox.Css("div.image-wrapper.default-state > img")(0).Attributes ("src")
		item.Text = postBox.Css("div.summary > p")(0).TextContentClean
		scienceList.Add(item)
	Next postBox

	Scrape(scienceList, "BlogScience.Jsonl")
End Sub
VB   C#

在我們創建模型之後,我們可以解析響應對象以深入其主要元素。(標題, 作者, 日期, 圖片, 文字)

然後,我們使用另存並儲存結果到單獨的文件中。 抓取(對象, 檔案名稱).

點擊這裡查看阿赫瑪德關於使用IronWebscraper的完整教程

開始使用IronWebscraper

立即在您的專案中使用IronWebScraper,並享受免費試用。

第一步:
green arrow pointer


網頁抓取從來不是一項簡單的任務,在C#或.NET編程環境中沒有主導框架。Iron Web Scraper的出現改變了這一現狀。
.NET軟體工程師 對很多人來說,這是從 .NET 生成 PDF 文件的最有效方法,因為不需要學習額外的 API 或導航複雜的設計系統

艾哈邁德·阿布爾馬格德

跨國IT公司 .NET 軟體解決方案架構師

Ahmed 是一位經驗豐富且具有認證的微軟技術專家,擁有超過10年在 IT 和軟體開發領域的經驗。他曾在多家不同公司工作,現在是某跨國 IT 公司的國家經理。

Ahmed已經使用IronPDF和IronWebScraper超過一年,並在他公司中的多個專案中使用它們。