如何在 C&num 中抓取博客;
让我们使用Iron WebScraper通过C#或VB.NET提取博客内容。
此教程展示了如何建立一个WordPress博客(或类似)可使用 .NET 将内容重新刮回
public class BlogScraper : WebScraper
{
/// <summary>
/// Override this method initializes your web-scraper.
/// Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
/// </summary>
public override void Init()
{
License.LicenseKey = " LicenseKey ";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";
EnableWebCache(new TimeSpan(1, 30, 30));
this.Request("http://blogSite.com/", Parse);
}
}
public class BlogScraper : WebScraper
{
/// <summary>
/// Override this method initializes your web-scraper.
/// Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
/// </summary>
public override void Init()
{
License.LicenseKey = " LicenseKey ";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";
EnableWebCache(new TimeSpan(1, 30, 30));
this.Request("http://blogSite.com/", Parse);
}
}
Public Class BlogScraper
Inherits WebScraper
''' <summary>
''' Override this method initializes your web-scraper.
''' Important tasks will be to Request at least one start url... and set allowed/banned domain or url patterns.
''' </summary>
Public Overrides Sub Init()
License.LicenseKey = " LicenseKey "
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\BlogSample\Output\"
EnableWebCache(New TimeSpan(1, 30, 30))
Me.Request("http://blogSite.com/", Parse)
End Sub
End Class
像往常一样,我们创建一个Scraper并从WebScraper类继承。 在这种情况下是“BlogScraper”
我们将工作目录设置为“\BlogSample\Output\”,这是所有输出和缓存文件的存放位置。
然后我们启用Webcache,将请求的页面保存在缓存文件夹“WebCache”中。
现在让我们编写一个解析函数:
/// <summary>
/// Override this method to create the default Response handler for your web scraper.
/// If you have multiple page types, you can add additional similar methods.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
foreach (var link in response.Css("div.section-nav > ul > li > a "))
{
switch(link.TextContentClean)
{
case "Reviews":
{
}break;
case "Science":
{
}break;
default:
{
// Save Result to File
Scrape(new ScrapedData() { { "Title", link.TextContentClean } }, "BlogScraper.Jsonl");
}
break;
}
}
}
/// <summary>
/// Override this method to create the default Response handler for your web scraper.
/// If you have multiple page types, you can add additional similar methods.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
foreach (var link in response.Css("div.section-nav > ul > li > a "))
{
switch(link.TextContentClean)
{
case "Reviews":
{
}break;
case "Science":
{
}break;
default:
{
// Save Result to File
Scrape(new ScrapedData() { { "Title", link.TextContentClean } }, "BlogScraper.Jsonl");
}
break;
}
}
}
''' <summary>
''' Override this method to create the default Response handler for your web scraper.
''' If you have multiple page types, you can add additional similar methods.
''' </summary>
''' <param name="response">The http Response object to parse</param>
Public Overrides Sub Parse(ByVal response As Response)
For Each link In response.Css("div.section-nav > ul > li > a ")
Select Case link.TextContentClean
Case "Reviews"
Case "Science"
Case Else
' Save Result to File
Scrape(New ScrapedData() From {
{ "Title", link.TextContentClean }
},
"BlogScraper.Jsonl")
End Select
Next link
End Sub
在解析方法内部; 我们解析顶部菜单以获取指向所有类别页面的所有链接。(电影、科学、评论等)
然后我们根据链接类别切换到合适的解析方法。
让我们为科学页面准备我们的对象模型:
/// <summary>
/// ScienceModel
/// </summary>
public class ScienceModel
{
/// <summary>
/// Gets or sets the title.
/// </summary>
/// <value>
/// The title.
/// </value>
public string Title { get; set; }
/// <summary>
/// Gets or sets the author.
/// </summary>
/// <value>
/// The author.
/// </value>
public string Author { get; set; }
/// <summary>
/// Gets or sets the date.
/// </summary>
/// <value>
/// The date.
/// </value>
public string Date { get; set; }
/// <summary>
/// Gets or sets the image.
/// </summary>
/// <value>
/// The image.
/// </value>
public string Image { get; set; }
/// <summary>
/// Gets or sets the text.
/// </summary>
/// <value>
/// The text.
/// </value>
public string Text { get; set; }
}
/// <summary>
/// ScienceModel
/// </summary>
public class ScienceModel
{
/// <summary>
/// Gets or sets the title.
/// </summary>
/// <value>
/// The title.
/// </value>
public string Title { get; set; }
/// <summary>
/// Gets or sets the author.
/// </summary>
/// <value>
/// The author.
/// </value>
public string Author { get; set; }
/// <summary>
/// Gets or sets the date.
/// </summary>
/// <value>
/// The date.
/// </value>
public string Date { get; set; }
/// <summary>
/// Gets or sets the image.
/// </summary>
/// <value>
/// The image.
/// </value>
public string Image { get; set; }
/// <summary>
/// Gets or sets the text.
/// </summary>
/// <value>
/// The text.
/// </value>
public string Text { get; set; }
}
''' <summary>
''' ScienceModel
''' </summary>
Public Class ScienceModel
''' <summary>
''' Gets or sets the title.
''' </summary>
''' <value>
''' The title.
''' </value>
Public Property Title() As String
''' <summary>
''' Gets or sets the author.
''' </summary>
''' <value>
''' The author.
''' </value>
Public Property Author() As String
''' <summary>
''' Gets or sets the date.
''' </summary>
''' <value>
''' The date.
''' </value>
Public Property [Date]() As String
''' <summary>
''' Gets or sets the image.
''' </summary>
''' <value>
''' The image.
''' </value>
Public Property Image() As String
''' <summary>
''' Gets or sets the text.
''' </summary>
''' <value>
''' The text.
''' </value>
Public Property Text() As String
End Class
现在,让我们执行单页刮擦:
/// <summary>
/// Parses the reviews.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
// List of Science Link
var scienceList = new List<ScienceModel>();
foreach (var postBox in response.Css("section.main > div > div.post-list"))
{
var item = new ScienceModel();
item.Title = postBox.Css("h1.headline > a")[0].TextContentClean;
item.Author = postBox.Css("div.author > a")[0].TextContentClean;
item.Date = postBox.Css("div.time > a")[0].TextContentClean;
item.Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes ["src"];
item.Text = postBox.Css("div.summary > p")[0].TextContentClean;
scienceList.Add(item);
}
Scrape(scienceList, "BlogScience.Jsonl");
}
/// <summary>
/// Parses the reviews.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
// List of Science Link
var scienceList = new List<ScienceModel>();
foreach (var postBox in response.Css("section.main > div > div.post-list"))
{
var item = new ScienceModel();
item.Title = postBox.Css("h1.headline > a")[0].TextContentClean;
item.Author = postBox.Css("div.author > a")[0].TextContentClean;
item.Date = postBox.Css("div.time > a")[0].TextContentClean;
item.Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes ["src"];
item.Text = postBox.Css("div.summary > p")[0].TextContentClean;
scienceList.Add(item);
}
Scrape(scienceList, "BlogScience.Jsonl");
}
''' <summary>
''' Parses the reviews.
''' </summary>
''' <param name="response">The response.</param>
Public Sub ParseReviews(ByVal response As Response)
' List of Science Link
Dim scienceList = New List(Of ScienceModel)()
For Each postBox In response.Css("section.main > div > div.post-list")
Dim item = New ScienceModel()
item.Title = postBox.Css("h1.headline > a")(0).TextContentClean
item.Author = postBox.Css("div.author > a")(0).TextContentClean
item.Date = postBox.Css("div.time > a")(0).TextContentClean
item.Image = postBox.Css("div.image-wrapper.default-state > img")(0).Attributes ("src")
item.Text = postBox.Css("div.summary > p")(0).TextContentClean
scienceList.Add(item)
Next postBox
Scrape(scienceList, "BlogScience.Jsonl")
End Sub
创建模型后,我们可以解析响应对象,深入了解其主要元素(标题、作者、日期、图像、文本)
然后,我们使用 刮削(对象,文件名)
.
点击此处查看艾哈迈德的 IronWebscraper 完整使用教程
开始使用IronWebscraper
立即在您的项目中开始使用IronWebScraper,并享受免费试用。