Wie man einen Blog in C# scrapen kann
Verwenden wir Iron WebScraper, um Blog-Inhalte mit C# oder VB.NET zu extrahieren.
Dieses Tutorial zeigt, wie ein WordPress-Blog (oder ein ähnlicher Blog) mit Hilfe von .NET wieder in Inhalte umgewandelt werden kann.
public class BlogScraper : WebScraper
{
    /// <summary>
    /// One-time scraper setup, run before the first request is issued.
    /// Configures the license key, logging, working directory and web cache,
    /// then queues the start URL with <see cref="Parse"/> as its handler.
    /// </summary>
    public override void Init()
    {
        License.LicenseKey = " LicenseKey ";

        // Log everything while developing; lower this in production.
        LoggingLevel = WebScraper.LogLevel.All;

        // All output and cache files are written below this directory.
        WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";

        // Cache fetched pages (TimeSpan of 1h 30m 30s) to avoid re-downloading.
        EnableWebCache(new TimeSpan(1, 30, 30));

        // Seed request: the crawl starts at the blog front page.
        Request("http://blogSite.com/", Parse);
    }
}
public class BlogScraper : WebScraper
{
    /// <summary>
    /// Initializes this web scraper instance.
    /// Responsibilities: set the license key, pick a logging level,
    /// choose the working directory, enable the page cache, and
    /// register at least one start URL with a response handler.
    /// </summary>
    public override void Init()
    {
        License.LicenseKey = " LicenseKey ";

        // Output and cache files live under the working directory.
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\BlogSample\Output\";

        // Verbose logging for this tutorial sample.
        this.LoggingLevel = WebScraper.LogLevel.All;

        // Re-use previously downloaded pages for the cache lifetime (1:30:30).
        EnableWebCache(new TimeSpan(1, 30, 30));

        // Kick off the crawl at the blog home page; Parse handles the response.
        this.Request("http://blogSite.com/", Parse);
    }
}
Public Class BlogScraper
Inherits WebScraper
''' <summary>
''' One-time scraper setup, run before the first request is issued.
''' Configures the license key, logging level, working directory and
''' web cache, then queues the start URL with Parse as its handler.
''' </summary>
Public Overrides Sub Init()
' Placeholder license key - replace with a real key before running.
License.LicenseKey = " LicenseKey "
' Verbose logging for this tutorial sample.
Me.LoggingLevel = WebScraper.LogLevel.All
' All output and cache files are written below this directory.
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\BlogSample\Output\"
' Cache fetched pages (1h 30m 30s) to avoid re-downloading.
EnableWebCache(New TimeSpan(1, 30, 30))
' Seed request: the crawl starts at the blog front page.
Me.Request("http://blogSite.com/", Parse)
End Sub
End Class
Wie üblich erstellen wir einen Scraper und erben von der Klasse WebScraper. In diesem Fall ist es "BlogScraper"
Wir legen ein Arbeitsverzeichnis mit dem Namen "\BlogSample\Output\" fest, in dem alle Ausgabe- und Cache-Dateien gespeichert werden können.
Dann aktivieren wir den Webcache, um angeforderte Seiten im Cache-Ordner "WebCache" zu speichern
Lassen Sie uns nun eine Parse-Funktion schreiben:
/// <summary>
/// Default response handler: walks the anchors of the top navigation
/// menu and dispatches each category link by its cleaned text content.
/// Known categories ("Reviews", "Science") are placeholders here;
/// everything else is saved to a JSONL result file.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
    foreach (var link in response.Css("div.section-nav > ul > li > a "))
    {
        var category = link.TextContentClean;

        if (category == "Reviews")
        {
            // Placeholder: a dedicated parser handles this category.
        }
        else if (category == "Science")
        {
            // Placeholder: a dedicated parser handles this category.
        }
        else
        {
            // Save the link title to a JSONL result file.
            Scrape(new ScrapedData() { { "Title", category } }, "BlogScraper.Jsonl");
        }
    }
}
/// <summary>
/// Default response handler for this web scraper.
/// Iterates the section-navigation links and routes each one by its
/// cleaned text; unrecognized categories are persisted as JSONL rows.
/// </summary>
/// <param name="response">The http Response object to parse</param>
public override void Parse(Response response)
{
    foreach (var link in response.Css("div.section-nav > ul > li > a "))
    {
        switch (link.TextContentClean)
        {
            // Known categories are intentionally left unhandled here;
            // they get dedicated handlers elsewhere in the tutorial.
            case "Reviews":
            case "Science":
                break;

            default:
                // Save the link title to the JSONL result file.
                Scrape(new ScrapedData() { { "Title", link.TextContentClean } }, "BlogScraper.Jsonl");
                break;
        }
    }
}
''' <summary>
''' Default response handler: iterates the section-navigation links and
''' routes each one by its cleaned text content. "Reviews" and "Science"
''' are placeholder cases; every other link is saved as a JSONL row.
''' </summary>
''' <param name="response">The http Response object to parse</param>
Public Overrides Sub Parse(ByVal response As Response)
For Each link In response.Css("div.section-nav > ul > li > a ")
Select Case link.TextContentClean
' Known categories get dedicated handlers elsewhere in the tutorial.
Case "Reviews"
Case "Science"
Case Else
' Save the link title to the JSONL result file.
Scrape(New ScrapedData() From {
{ "Title", link.TextContentClean }
},
"BlogScraper.Jsonl")
End Select
Next link
End Sub
Innerhalb der Parse-Methode analysieren wir das obere Menü, um die Links zu allen Kategorieseiten zu erhalten (Filme, Wissenschaft, Rezensionen usw.).
Dann wechseln wir zu einer geeigneten Parse-Methode auf der Grundlage der Link-Kategorie.
Bereiten wir unser Objektmodell für die Wissenschaftsseite vor:
/// <summary>
/// Data model for one article scraped from the science category page.
/// All fields are stored as raw page text.
/// </summary>
public class ScienceModel
{
    /// <summary>Gets or sets the article headline.</summary>
    public string Title { get; set; }

    /// <summary>Gets or sets the author name.</summary>
    public string Author { get; set; }

    /// <summary>Gets or sets the publication date as shown on the page.</summary>
    public string Date { get; set; }

    /// <summary>Gets or sets the preview image URL.</summary>
    public string Image { get; set; }

    /// <summary>Gets or sets the article summary text.</summary>
    public string Text { get; set; }
}
/// <summary>
/// Plain data holder for a single science-page post:
/// headline, author, date, image URL and summary, all as strings.
/// </summary>
public class ScienceModel
{
    /// <summary>Gets or sets the post headline.</summary>
    public string Title { get; set; }

    /// <summary>Gets or sets the post author.</summary>
    public string Author { get; set; }

    /// <summary>Gets or sets the raw date text.</summary>
    public string Date { get; set; }

    /// <summary>Gets or sets the image source URL.</summary>
    public string Image { get; set; }

    /// <summary>Gets or sets the summary paragraph text.</summary>
    public string Text { get; set; }
}
''' <summary>
''' Data model for one article scraped from the science category page.
''' All fields are stored as raw page text.
''' </summary>
Public Class ScienceModel
''' <summary>
''' Gets or sets the article headline.
''' </summary>
Public Property Title() As String
''' <summary>
''' Gets or sets the author name.
''' </summary>
Public Property Author() As String
''' <summary>
''' Gets or sets the publication date as shown on the page.
''' </summary>
Public Property [Date]() As String
''' <summary>
''' Gets or sets the preview image URL.
''' </summary>
Public Property Image() As String
''' <summary>
''' Gets or sets the article summary text.
''' </summary>
Public Property Text() As String
End Class
Nun wollen wir einen einzelnen Seiten-Scrape implementieren:
/// <summary>
/// Parses one category page: extracts title, author, date, image and
/// summary for every post box and saves the result set as JSONL.
/// NOTE(review): the method name says "Reviews" but the selectors and
/// output file target the science page - confirm the intended naming.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
    // One model per post box found on the page.
    var scienceList = new List<ScienceModel>();

    foreach (var postBox in response.Css("section.main > div > div.post-list"))
    {
        // Assumes each selector matches at least one element per post box;
        // an empty match would throw here - TODO confirm against the markup.
        scienceList.Add(new ScienceModel
        {
            Title = postBox.Css("h1.headline > a")[0].TextContentClean,
            Author = postBox.Css("div.author > a")[0].TextContentClean,
            Date = postBox.Css("div.time > a")[0].TextContentClean,
            Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes["src"],
            Text = postBox.Css("div.summary > p")[0].TextContentClean,
        });
    }

    // Persist the whole page's results in one JSONL file.
    Scrape(scienceList, "BlogScience.Jsonl");
}
/// <summary>
/// Handler for a category listing page: builds one model per post box
/// (title, author, date, image URL, summary) and writes them to JSONL.
/// NOTE(review): name suggests reviews, output file says science -
/// verify which category this handler is registered for.
/// </summary>
/// <param name="response">The response.</param>
public void ParseReviews(Response response)
{
    var results = new List<ScienceModel>();

    foreach (var postBox in response.Css("section.main > div > div.post-list"))
    {
        // Presumably every post box contains each of these elements;
        // an empty selector match would throw on [0] - confirm markup.
        var post = new ScienceModel();
        post.Title = postBox.Css("h1.headline > a")[0].TextContentClean;
        post.Author = postBox.Css("div.author > a")[0].TextContentClean;
        post.Date = postBox.Css("div.time > a")[0].TextContentClean;
        post.Image = postBox.Css("div.image-wrapper.default-state > img")[0].Attributes["src"];
        post.Text = postBox.Css("div.summary > p")[0].TextContentClean;
        results.Add(post);
    }

    // Persist all extracted posts in a single JSONL file.
    Scrape(results, "BlogScience.Jsonl");
}
''' <summary>
''' Parses one category page: extracts title, author, date, image and
''' summary for every post box and saves the result set as JSONL.
''' NOTE(review): the method name says "Reviews" but the selectors and
''' output file target the science page - confirm the intended naming.
''' </summary>
''' <param name="response">The response.</param>
Public Sub ParseReviews(ByVal response As Response)
' One model per post box found on the page.
Dim scienceList = New List(Of ScienceModel)()
For Each postBox In response.Css("section.main > div > div.post-list")
' Assumes each selector matches at least one element per post box;
' an empty match would throw on (0) - TODO confirm against the markup.
Dim item = New ScienceModel()
item.Title = postBox.Css("h1.headline > a")(0).TextContentClean
item.Author = postBox.Css("div.author > a")(0).TextContentClean
item.Date = postBox.Css("div.time > a")(0).TextContentClean
item.Image = postBox.Css("div.image-wrapper.default-state > img")(0).Attributes ("src")
item.Text = postBox.Css("div.summary > p")(0).TextContentClean
scienceList.Add(item)
Next postBox
' Persist the whole page's results in one JSONL file.
Scrape(scienceList, "BlogScience.Jsonl")
End Sub
Nachdem wir unser Modell erstellt haben, können wir das Antwortobjekt analysieren, um seine Hauptelemente aufzuschlüsseln (Titel, Autor, Datum, Bild, Text).
Dann speichern wir unser Ergebnis mit Scrape(Objekt, Dateiname) in einer separaten Datei.
Klicken Sie hier für Ahmeds vollständige Anleitung zur Verwendung von IronWebscraper
Erste Schritte mit IronWebscraper
Beginnen Sie noch heute mit der Verwendung von IronWebScraper in Ihrem Projekt mit einer kostenlosen Testversion.