オンライン映画サイトのスクレイピング
現実世界のウェブサイトから別の例を始めよう。映画のウェブサイトをスクレイピングしてみよう。
新しいクラスを追加し、名前を「MovieScraper」としよう:
では、スクレイピングするサイトを見てみよう:
これはホームページのHTMLの一部である:
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
<div data-movie-id="20746" class="ml-item">
<a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
</a>
</div>
<div data-movie-id="20724" class="ml-item">
<a href="https://website.com/film/snatched-20724/" >
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
class="lazy thumb mli-thumb" alt="Snatched"
src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>Snatched</h2></span>
</a>
</div>
</div>
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
<div data-movie-id="20746" class="ml-item">
<a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
</a>
</div>
<div data-movie-id="20724" class="ml-item">
<a href="https://website.com/film/snatched-20724/" >
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
class="lazy thumb mli-thumb" alt="Snatched"
src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>Snatched</h2></span>
</a>
</div>
</div>
ご覧のように、ムービーID、タイトル、詳細ページへのリンクがあります。
このデータ一式をスクレイピングしてみよう:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("www.website.com", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var MovieId = Divs.GetAttribute("data-movie-id");
var link = Divs.Css("a")[0];
var MovieTitle = link.TextContentClean;
Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
}
}
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("www.website.com", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var MovieId = Divs.GetAttribute("data-movie-id");
var link = Divs.Css("a")[0];
var MovieTitle = link.TextContentClean;
Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
}
}
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("www.website.com", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim MovieId = Divs.GetAttribute("data-movie-id")
Dim link = Divs.Css("a")(0)
Dim MovieTitle = link.TextContentClean
Scrape(New ScrapedData() From {
{ "MovieId", MovieId },
{ "MovieTitle", MovieTitle }
},
"Movie.Jsonl")
End If
Next Divs
End Sub
End Class
*このコードで何が新しくなったのか?
Working Directoryプロパティは、すべてのスクレイピングされたデータとその関連ファイルのためのメイン作業ディレクトリを設定するために使用されます。
*もっとやろう。
スクレイピングされたデータをフォーマットされたオブジェクトに格納するために、型付きオブジェクトを構築する必要があるとしたら?
フォーマットされたデータを保持するムービー・クラスを実装しよう:
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
}
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
}
Public Class Movie
Public Property Id() As Integer
Public Property Title() As String
Public Property URL() As String
End Class
では、コードを更新しよう:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://website.com/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
Scrape(movie, "Movie.Jsonl");
}
}
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://website.com/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
Scrape(movie, "Movie.Jsonl");
}
}
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("https://website.com/", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim movie As New Movie()
movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
Dim link = Divs.Css("a")(0)
movie.Title = link.TextContentClean
movie.URL = link.Attributes ("href")
Scrape(movie, "Movie.Jsonl")
End If
Next Divs
End Sub
End Class
*何が新しいのか?
-
スクレイピングされたデータを保持するためにムービークラスを実装します。
- ムービーオブジェクトをスクレイプ・メソッドに渡すと、スクレイプ・メソッドは我々のフォーマットを理解し、ここにあるように定義されたフォーマットで保存する:
より詳細なページのスクレイピングを始めよう。
ムービーページはこんな感じ:
<div class="mvi-content">
<div class="thumb mvic-thumb"
style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
<div class="mvic-desc">
<h3>Guardians of the Galaxy Vol. 2</h3>
<div class="desc">
Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
</div>
<div class="mvic-info">
<div class="mvici-left">
<p>
<strong>Genre: </strong>
<a href="https://Domain/genre/action/" title="Action">Action</a>,
<a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
<a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
</p>
<p>
<strong>Actor: </strong>
<a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
<a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
<a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
</p>
<p>
<strong>Director: </strong>
<a href="#" title="James Gunn">James Gunn</a>
</p>
<p>
<strong>Country: </strong>
<a href="https://Domain/country/us" title="United States">United States</a>
</p>
</div>
<div class="mvici-right">
<p><strong>Duration:</strong> 136 min</p>
<p><strong>Quality:</strong> <span class="quality">CAM</span></p>
<p><strong>Release:</strong> 2017</p>
<p><strong>IMDb:</strong> 8.3</p>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="mvi-content">
<div class="thumb mvic-thumb"
style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
<div class="mvic-desc">
<h3>Guardians of the Galaxy Vol. 2</h3>
<div class="desc">
Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
</div>
<div class="mvic-info">
<div class="mvici-left">
<p>
<strong>Genre: </strong>
<a href="https://Domain/genre/action/" title="Action">Action</a>,
<a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
<a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
</p>
<p>
<strong>Actor: </strong>
<a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
<a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
<a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
</p>
<p>
<strong>Director: </strong>
<a href="#" title="James Gunn">James Gunn</a>
</p>
<p>
<strong>Country: </strong>
<a href="https://Domain/country/us" title="United States">United States</a>
</p>
</div>
<div class="mvici-right">
<p><strong>Duration:</strong> 136 min</p>
<p><strong>Quality:</strong> <span class="quality">CAM</span></p>
<p><strong>Release:</strong> 2017</p>
<p><strong>IMDb:</strong> 8.3</p>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
新しいプロパティでムービークラスを拡張することができます。(概要, ジャンル, 俳優, 監督, 国, 収録時間, IMDBスコア)しかし、ここでは(解説, ジャンル, 俳優)のみである。
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
public string Description { get; set; }
public List<string> Genre { get; set; }
public List<string> Actor { get; set; }
}
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
public string Description { get; set; }
public List<string> Genre { get; set; }
public List<string> Actor { get; set; }
}
Public Class Movie
Public Property Id() As Integer
Public Property Title() As String
Public Property URL() As String
Public Property Description() As String
Public Property Genre() As List(Of String)
Public Property Actor() As List(Of String)
End Class
では、詳細ページに移動してスクレイピングしてみよう。
IronWebscraperでは、スクレイピング機能を追加することで、様々なタイプのページフォーマットをスクレイピングすることができる。
ここにあるように:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://domain/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
}
}
}
public void ParseDetails(Response response)
{
var movie = response.MetaData.Get<Movie>("movie");
var Div = response.Css("div.mvic-desc")[0];
movie.Description = Div.Css("div.desc")[0].TextContentClean;
foreach(var Genre in Div.Css("div > p > a"))
{
movie.Genre.Add(Genre.TextContentClean);
}
foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
{
movie.Actor.Add(Actor.TextContentClean);
}
Scrape(movie, "Movie.Jsonl");
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://domain/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
}
}
}
public void ParseDetails(Response response)
{
var movie = response.MetaData.Get<Movie>("movie");
var Div = response.Css("div.mvic-desc")[0];
movie.Description = Div.Css("div.desc")[0].TextContentClean;
foreach(var Genre in Div.Css("div > p > a"))
{
movie.Genre.Add(Genre.TextContentClean);
}
foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
{
movie.Actor.Add(Actor.TextContentClean);
}
Scrape(movie, "Movie.Jsonl");
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("https://domain/", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim movie As New Movie()
movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
Dim link = Divs.Css("a")(0)
movie.Title = link.TextContentClean
movie.URL = link.Attributes ("href")
Me.Request(movie.URL, AddressOf ParseDetails, New MetaData() From {
{ "movie", movie }
}) ' to scrap Detailed Page
End If
Next Divs
End Sub
Public Sub ParseDetails(ByVal response As Response)
Dim movie = response.MetaData.Get(Of Movie)("movie")
Dim Div = response.Css("div.mvic-desc")(0)
movie.Description = Div.Css("div.desc")(0).TextContentClean
For Each Genre In Div.Css("div > p > a")
movie.Genre.Add(Genre.TextContentClean)
Next Genre
For Each Actor In Div.Css("div > p:nth-child(2) > a")
movie.Actor.Add(Actor.TextContentClean)
Next Actor
Scrape(movie, "Movie.Jsonl")
End Sub
End Class
*何が新しいのか?