從線上電影網站擷取資料

This article was translated from English: Does it need improvement?
Translated
View the article in English

讓我們從真實世界的網站開始另一個例子。我們將選擇擷取一個電影網站的資料。

讓我們新增一個類別並命名為“MovieScraper”:

現在讓我們來看看我們將要抓取的網站:

這是我們在網站上看到的首頁 HTML 的一部分:

<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
    <div data-movie-id="20746" class="ml-item">
        <a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
                  src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
        </a>
    </div>
    <div data-movie-id="20724" class="ml-item">
        <a href="https://website.com/film/snatched-20724/" >
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 class="lazy thumb mli-thumb" alt="Snatched" 
                 src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>Snatched</h2></span>
        </a>
    </div>
</div>
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
    <div data-movie-id="20746" class="ml-item">
        <a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
                  src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
        </a>
    </div>
    <div data-movie-id="20724" class="ml-item">
        <a href="https://website.com/film/snatched-20724/" >
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 class="lazy thumb mli-thumb" alt="Snatched" 
                 src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>Snatched</h2></span>
        </a>
    </div>
</div>
HTML

如我們所見,我們擁有一個電影 ID、標題和詳細頁面的鏈接。

讓我們開始抓取這組資料:

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("www.website.com", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var MovieId = Divs.GetAttribute("data-movie-id");
                var link = Divs.Css("a")[0];
                var MovieTitle = link.TextContentClean;
                Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
            }
        }           
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("www.website.com", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var MovieId = Divs.GetAttribute("data-movie-id");
                var link = Divs.Css("a")[0];
                var MovieTitle = link.TextContentClean;
                Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
            }
        }           
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("www.website.com", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim MovieId = Divs.GetAttribute("data-movie-id")
				Dim link = Divs.Css("a")(0)
				Dim MovieTitle = link.TextContentClean
				Scrape(New ScrapedData() From {
					{ "MovieId", MovieId },
					{ "MovieTitle", MovieTitle }
				},
				"Movie.Jsonl")
			End If
		Next Divs
	End Sub
End Class
VB   C#

這段程式碼有什麼新功能?

Working Directory屬性用於設置所有抓取數據及其相關文件的主要工作目錄。

讓我們做更多。

如果我們需要建立類型化的物件來保存格式化的抓取數據該怎麼辦?

讓我們實作一個電影類別來保存我們的格式化數據:

public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }

}
public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }

}
Public Class Movie
	Public Property Id() As Integer
	Public Property Title() As String
	Public Property URL() As String

End Class
VB   C#

現在我們將更新我們的代碼:

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://website.com/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                Scrape(movie, "Movie.Jsonl");
            }
        }
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://website.com/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                Scrape(movie, "Movie.Jsonl");
            }
        }
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("https://website.com/", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim movie As New Movie()
				movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
				Dim link = Divs.Css("a")(0)
				movie.Title = link.TextContentClean
				movie.URL = link.Attributes ("href")
				Scrape(movie, "Movie.Jsonl")
			End If
		Next Divs
	End Sub
End Class
VB   C#

最新消息

  1. 我们实现了 Movie 类以保存我们抓取的数据

  2. 我们将电影对象传递给 Scrape 方法,它可以理解我们的格式并以定义的格式保存,如我们在此处所见:

讓我們開始抓取更詳細的頁面。

電影頁面看起來像這樣:

<div class="mvi-content">
    <div class="thumb mvic-thumb"
         style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
    <div class="mvic-desc">
        <h3>Guardians of the Galaxy Vol. 2</h3>        
        <div class="desc">
            Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
        </div>
        <div class="mvic-info">
            <div class="mvici-left">
                <p>
                    <strong>Genre: </strong>
                    <a href="https://Domain/genre/action/" title="Action">Action</a>,
                    <a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
                    <a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
                </p>
                <p>
                    <strong>Actor: </strong>
                    <a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
                    <a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
                    <a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
                </p>
                <p>
                    <strong>Director: </strong>
                    <a href="#" title="James Gunn">James Gunn</a>
                </p>
                <p>
                    <strong>Country: </strong>
                    <a href="https://Domain/country/us" title="United States">United States</a>
                </p>
            </div>
            <div class="mvici-right">
                <p><strong>Duration:</strong> 136 min</p>
                <p><strong>Quality:</strong> <span class="quality">CAM</span></p>
                <p><strong>Release:</strong> 2017</p>
                <p><strong>IMDb:</strong> 8.3</p>
            </div>
            <div class="clearfix"></div>
        </div>
        <div class="clearfix"></div>
    </div>
    <div class="clearfix"></div>
</div>
<div class="mvi-content">
    <div class="thumb mvic-thumb"
         style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
    <div class="mvic-desc">
        <h3>Guardians of the Galaxy Vol. 2</h3>        
        <div class="desc">
            Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
        </div>
        <div class="mvic-info">
            <div class="mvici-left">
                <p>
                    <strong>Genre: </strong>
                    <a href="https://Domain/genre/action/" title="Action">Action</a>,
                    <a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
                    <a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
                </p>
                <p>
                    <strong>Actor: </strong>
                    <a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
                    <a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
                    <a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
                </p>
                <p>
                    <strong>Director: </strong>
                    <a href="#" title="James Gunn">James Gunn</a>
                </p>
                <p>
                    <strong>Country: </strong>
                    <a href="https://Domain/country/us" title="United States">United States</a>
                </p>
            </div>
            <div class="mvici-right">
                <p><strong>Duration:</strong> 136 min</p>
                <p><strong>Quality:</strong> <span class="quality">CAM</span></p>
                <p><strong>Release:</strong> 2017</p>
                <p><strong>IMDb:</strong> 8.3</p>
            </div>
            <div class="clearfix"></div>
        </div>
        <div class="clearfix"></div>
    </div>
    <div class="clearfix"></div>
</div>
HTML

我們可以為我們的電影類別擴展新的屬性 (描述, 類型, 演員, 導演, 國家, 片長, IMDB 評分) 但我們將使用 (描述, 類型, 演員) 僅供我們的範例使用。

public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }
    public string Description { get; set; }
    public List<string> Genre { get; set; }
    public List<string> Actor { get; set; }

}
public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }
    public string Description { get; set; }
    public List<string> Genre { get; set; }
    public List<string> Actor { get; set; }

}
Public Class Movie
	Public Property Id() As Integer
	Public Property Title() As String
	Public Property URL() As String
	Public Property Description() As String
	Public Property Genre() As List(Of String)
	Public Property Actor() As List(Of String)

End Class
VB   C#

現在我們將導航到詳細頁面以進行抓取。

IronWebScraper讓您可以在抓取功能中添加更多內容,以抓取不同類型的頁面格式。

如我們所見:

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://domain/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
            }
        }           
    }
    public void ParseDetails(Response response)
    {
        var movie = response.MetaData.Get<Movie>("movie");
        var Div = response.Css("div.mvic-desc")[0];
        movie.Description = Div.Css("div.desc")[0].TextContentClean;
        foreach(var Genre in Div.Css("div > p > a"))
        {
            movie.Genre.Add(Genre.TextContentClean);
        }
        foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
        {
            movie.Actor.Add(Actor.TextContentClean);
        }
        Scrape(movie, "Movie.Jsonl");
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://domain/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
            }
        }           
    }
    public void ParseDetails(Response response)
    {
        var movie = response.MetaData.Get<Movie>("movie");
        var Div = response.Css("div.mvic-desc")[0];
        movie.Description = Div.Css("div.desc")[0].TextContentClean;
        foreach(var Genre in Div.Css("div > p > a"))
        {
            movie.Genre.Add(Genre.TextContentClean);
        }
        foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
        {
            movie.Actor.Add(Actor.TextContentClean);
        }
        Scrape(movie, "Movie.Jsonl");
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("https://domain/", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim movie As New Movie()
				movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
				Dim link = Divs.Css("a")(0)
				movie.Title = link.TextContentClean
				movie.URL = link.Attributes ("href")
				Me.Request(movie.URL, AddressOf ParseDetails, New MetaData() From {
					{ "movie", movie }
				}) ' to scrap Detailed Page
			End If
		Next Divs
	End Sub
	Public Sub ParseDetails(ByVal response As Response)
		Dim movie = response.MetaData.Get(Of Movie)("movie")
		Dim Div = response.Css("div.mvic-desc")(0)
		movie.Description = Div.Css("div.desc")(0).TextContentClean
		For Each Genre In Div.Css("div > p > a")
			movie.Genre.Add(Genre.TextContentClean)
		Next Genre
		For Each Actor In Div.Css("div > p:nth-child(2) > a")
			movie.Actor.Add(Actor.TextContentClean)
		Next Actor
		Scrape(movie, "Movie.Jsonl")
	End Sub
End Class
VB   C#

最新消息

  1. 我們可以新增爬取功能 (解析詳細資訊) 爬取詳細頁面

  2. 我們將生成文件的 Scrape 函數移至新函數

  3. 我們使用了 IronWebScraper 功能 (元數據) 將我們的電影物件傳遞給新的爬取函數

  4. 我們已經抓取了頁面並將我們的電影物件數據保存到文件中