搜索在线电影网站

This article was translated from English: Does it need improvement?
Translated
View the article in English

让我们从现实世界的网站开始另一个例子。我们将选择搜索一个电影网站。

让我们添加一个新类,并将其命名为 "MovieScraper":

现在,让我们来看看我们要搜索的网站:

这是我们在网站上看到的主页 HTML 的一部分:

<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
    <div data-movie-id="20746" class="ml-item">
        <a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
                  src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
        </a>
    </div>
    <div data-movie-id="20724" class="ml-item">
        <a href="https://website.com/film/snatched-20724/" >
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 class="lazy thumb mli-thumb" alt="Snatched" 
                 src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>Snatched</h2></span>
        </a>
    </div>
</div>
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
    <div data-movie-id="20746" class="ml-item">
        <a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
                  src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
        </a>
    </div>
    <div data-movie-id="20724" class="ml-item">
        <a href="https://website.com/film/snatched-20724/" >
            <span class="mli-quality">CAM</span>
            <img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 class="lazy thumb mli-thumb" alt="Snatched" 
                 src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg" 
                 style="display: inline-block;">
            <span class="mli-info"><h2>Snatched</h2></span>
        </a>
    </div>
</div>
HTML

正如我们所看到的,我们有一个电影 ID、标题和详细页面链接。

让我们开始扫描这组数据:

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("www.website.com", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var MovieId = Divs.GetAttribute("data-movie-id");
                var link = Divs.Css("a")[0];
                var MovieTitle = link.TextContentClean;
                Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
            }
        }           
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("www.website.com", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var MovieId = Divs.GetAttribute("data-movie-id");
                var link = Divs.Css("a")[0];
                var MovieTitle = link.TextContentClean;
                Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
            }
        }           
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("www.website.com", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim MovieId = Divs.GetAttribute("data-movie-id")
				Dim link = Divs.Css("a")(0)
				Dim MovieTitle = link.TextContentClean
				Scrape(New ScrapedData() From {
					{ "MovieId", MovieId },
					{ "MovieTitle", MovieTitle }
				},
				"Movie.Jsonl")
			End If
		Next Divs
	End Sub
End Class
VB   C#

*本代码有何新内容?

工作目录属性用于设置所有刮擦数据及其相关文件的主要工作目录。

*让我们再接再厉。

如果我们需要构建类型化对象,以格式化对象的形式保存刮擦数据,该怎么办?

让我们来实现一个影片类,它将保存我们的格式化数据:

public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }

}
public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }

}
Public Class Movie
	Public Property Id() As Integer
	Public Property Title() As String
	Public Property URL() As String

End Class
VB   C#

现在,我们将更新代码:

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://website.com/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                Scrape(movie, "Movie.Jsonl");
            }
        }
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://website.com/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                Scrape(movie, "Movie.Jsonl");
            }
        }
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("https://website.com/", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim movie As New Movie()
				movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
				Dim link = Divs.Css("a")(0)
				movie.Title = link.TextContentClean
				movie.URL = link.Attributes ("href")
				Scrape(movie, "Movie.Jsonl")
			End If
		Next Divs
	End Sub
End Class
VB   C#

*有什么新变化?

1.我们实现了 Movie 类来保存我们的刮擦数据

2.我们将影片对象传递给 Scrape 方法,它就会理解我们的格式,并以定义的格式保存,如图所示:

让我们开始扫描更详细的页面。

电影页面看起来是这样的

<div class="mvi-content">
    <div class="thumb mvic-thumb"
         style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
    <div class="mvic-desc">
        <h3>Guardians of the Galaxy Vol. 2</h3>        
        <div class="desc">
            Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
        </div>
        <div class="mvic-info">
            <div class="mvici-left">
                <p>
                    <strong>Genre: </strong>
                    <a href="https://Domain/genre/action/" title="Action">Action</a>,
                    <a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
                    <a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
                </p>
                <p>
                    <strong>Actor: </strong>
                    <a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
                    <a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
                    <a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
                </p>
                <p>
                    <strong>Director: </strong>
                    <a href="#" title="James Gunn">James Gunn</a>
                </p>
                <p>
                    <strong>Country: </strong>
                    <a href="https://Domain/country/us" title="United States">United States</a>
                </p>
            </div>
            <div class="mvici-right">
                <p><strong>Duration:</strong> 136 min</p>
                <p><strong>Quality:</strong> <span class="quality">CAM</span></p>
                <p><strong>Release:</strong> 2017</p>
                <p><strong>IMDb:</strong> 8.3</p>
            </div>
            <div class="clearfix"></div>
        </div>
        <div class="clearfix"></div>
    </div>
    <div class="clearfix"></div>
</div>
<div class="mvi-content">
    <div class="thumb mvic-thumb"
         style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
    <div class="mvic-desc">
        <h3>Guardians of the Galaxy Vol. 2</h3>        
        <div class="desc">
            Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
        </div>
        <div class="mvic-info">
            <div class="mvici-left">
                <p>
                    <strong>Genre: </strong>
                    <a href="https://Domain/genre/action/" title="Action">Action</a>,
                    <a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
                    <a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
                </p>
                <p>
                    <strong>Actor: </strong>
                    <a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
                    <a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
                    <a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
                </p>
                <p>
                    <strong>Director: </strong>
                    <a href="#" title="James Gunn">James Gunn</a>
                </p>
                <p>
                    <strong>Country: </strong>
                    <a href="https://Domain/country/us" title="United States">United States</a>
                </p>
            </div>
            <div class="mvici-right">
                <p><strong>Duration:</strong> 136 min</p>
                <p><strong>Quality:</strong> <span class="quality">CAM</span></p>
                <p><strong>Release:</strong> 2017</p>
                <p><strong>IMDb:</strong> 8.3</p>
            </div>
            <div class="clearfix"></div>
        </div>
        <div class="clearfix"></div>
    </div>
    <div class="clearfix"></div>
</div>
HTML

我们可以用新的属性来扩展我们的影片类 (说明, 类型, 演员, 导演, 国家, 片长, IMDB评分) 但我们将使用 (描述、类型、演员) 仅适用于我们的样本。

public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }
    public string Description { get; set; }
    public List<string> Genre { get; set; }
    public List<string> Actor { get; set; }

}
public class Movie
{
    public int Id { get; set; }
    public string Title { get; set; }
    public string URL { get; set; }
    public string Description { get; set; }
    public List<string> Genre { get; set; }
    public List<string> Actor { get; set; }

}
Public Class Movie
	Public Property Id() As Integer
	Public Property Title() As String
	Public Property URL() As String
	Public Property Description() As String
	Public Property Genre() As List(Of String)
	Public Property Actor() As List(Of String)

End Class
VB   C#

现在,我们将导航到 "详细 "页面,对其进行抓取。

IronWebScraper 可以让你添加更多的刮擦功能,以刮擦不同类型的页面格式

如图所示

public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://domain/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
            }
        }           
    }
    public void ParseDetails(Response response)
    {
        var movie = response.MetaData.Get<Movie>("movie");
        var Div = response.Css("div.mvic-desc")[0];
        movie.Description = Div.Css("div.desc")[0].TextContentClean;
        foreach(var Genre in Div.Css("div > p > a"))
        {
            movie.Genre.Add(Genre.TextContentClean);
        }
        foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
        {
            movie.Actor.Add(Actor.TextContentClean);
        }
        Scrape(movie, "Movie.Jsonl");
    }
}
public class MovieScraper : WebScraper
{
    public override void Init()
    {
        License.LicenseKey = "LicenseKey";
        this.LoggingLevel = WebScraper.LogLevel.All;
        this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
        this.Request("https://domain/", Parse);
    }
    public override void Parse(Response response)
    {
        foreach (var Divs in response.Css("#movie-featured > div"))
        {
            if (Divs.Attributes ["class"] != "clearfix")
            {
                var movie = new Movie();
                movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
                var link = Divs.Css("a")[0];
                movie.Title = link.TextContentClean;
                movie.URL = link.Attributes ["href"];
                this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
            }
        }           
    }
    public void ParseDetails(Response response)
    {
        var movie = response.MetaData.Get<Movie>("movie");
        var Div = response.Css("div.mvic-desc")[0];
        movie.Description = Div.Css("div.desc")[0].TextContentClean;
        foreach(var Genre in Div.Css("div > p > a"))
        {
            movie.Genre.Add(Genre.TextContentClean);
        }
        foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
        {
            movie.Actor.Add(Actor.TextContentClean);
        }
        Scrape(movie, "Movie.Jsonl");
    }
}
Public Class MovieScraper
	Inherits WebScraper

	Public Overrides Sub Init()
		License.LicenseKey = "LicenseKey"
		Me.LoggingLevel = WebScraper.LogLevel.All
		Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
		Me.Request("https://domain/", AddressOf Parse)
	End Sub
	Public Overrides Sub Parse(ByVal response As Response)
		For Each Divs In response.Css("#movie-featured > div")
			If Divs.Attributes ("class") <> "clearfix" Then
				Dim movie As New Movie()
				movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
				Dim link = Divs.Css("a")(0)
				movie.Title = link.TextContentClean
				movie.URL = link.Attributes ("href")
				Me.Request(movie.URL, AddressOf ParseDetails, New MetaData() From {
					{ "movie", movie }
				}) ' to scrap Detailed Page
			End If
		Next Divs
	End Sub
	Public Sub ParseDetails(ByVal response As Response)
		Dim movie = response.MetaData.Get(Of Movie)("movie")
		Dim Div = response.Css("div.mvic-desc")(0)
		movie.Description = Div.Css("div.desc")(0).TextContentClean
		For Each Genre In Div.Css("div > p > a")
			movie.Genre.Add(Genre.TextContentClean)
		Next Genre
		For Each Actor In Div.Css("div > p:nth-child(2) > a")
			movie.Actor.Add(Actor.TextContentClean)
		Next Actor
		Scrape(movie, "Movie.Jsonl")
	End Sub
End Class
VB   C#

*有什么新变化?

1.我们可以添加搜刮功能 (解析细节) 搜索详细页面

2.我们将生成文件的 Scrape 函数移到新函数中

3.我们使用了 IronWebScraper 功能 (元数据) 将我们的影片对象传递给新的刮擦函数

4.我们刮擦了页面,并将电影对象数据保存到文件中