Raspado de un sitio web de películas en línea
Empecemos con otro ejemplo de un sitio web del mundo real. Vamos a elegir para raspar un sitio web de películas.
Añadamos una nueva clase y llamémosla "MovieScraper":
Ahora vamos a echar un vistazo en el sitio que vamos a raspar:
Esto forma parte del HTML de la página de inicio que vemos en el sitio web:
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
<div data-movie-id="20746" class="ml-item">
<a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
</a>
</div>
<div data-movie-id="20724" class="ml-item">
<a href="https://website.com/film/snatched-20724/" >
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
class="lazy thumb mli-thumb" alt="Snatched"
src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>Snatched</h2></span>
</a>
</div>
</div>
<div id="movie-featured" class="movies-list movies-list-full tab-pane in fade active">
<div data-movie-id="20746" class="ml-item">
<a href="https://website.com/film/king-arthur-legend-of-the-sword-20746/">
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
class="lazy thumb mli-thumb" alt="King Arthur: Legend of the Sword"
src="https://img.gocdn.online/2017/05/16/poster/2116d6719c710eabe83b377463230fbe-king-arthur-legend-of-the-sword.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>King Arthur: Legend of the Sword</h2></span>
</a>
</div>
<div data-movie-id="20724" class="ml-item">
<a href="https://website.com/film/snatched-20724/" >
<span class="mli-quality">CAM</span>
<img data-original="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
class="lazy thumb mli-thumb" alt="Snatched"
src="https://img.gocdn.online/2017/05/16/poster/5ef66403dc331009bdb5aa37cfe819ba-snatched.jpg"
style="display: inline-block;">
<span class="mli-info"><h2>Snatched</h2></span>
</a>
</div>
</div>
Como podemos ver, tenemos el ID de la película, el Título y el Enlace a la página detallada.
Empecemos a raspar este conjunto de datos:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("www.website.com", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var MovieId = Divs.GetAttribute("data-movie-id");
var link = Divs.Css("a")[0];
var MovieTitle = link.TextContentClean;
Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
}
}
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("www.website.com", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var MovieId = Divs.GetAttribute("data-movie-id");
var link = Divs.Css("a")[0];
var MovieTitle = link.TextContentClean;
Scrape(new ScrapedData() { { "MovieId", MovieId }, { "MovieTitle", MovieTitle } }, "Movie.Jsonl");
}
}
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("www.website.com", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim MovieId = Divs.GetAttribute("data-movie-id")
Dim link = Divs.Css("a")(0)
Dim MovieTitle = link.TextContentClean
Scrape(New ScrapedData() From {
{ "MovieId", MovieId },
{ "MovieTitle", MovieTitle }
},
"Movie.Jsonl")
End If
Next Divs
End Sub
End Class
¿Qué hay de nuevo en este código?
La propiedad Directorio de trabajo se utiliza para establecer el directorio de trabajo principal para todos los datos raspados y sus archivos relacionados.
Hagamos más.
¿Y si necesitamos construir objetos tipados que contengan datos raspados en objetos formateados?
Vamos a implementar una clase de película que contendrá nuestros datos formateados:
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
}
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
}
Public Class Movie
Public Property Id() As Integer
Public Property Title() As String
Public Property URL() As String
End Class
Ahora actualizaremos nuestro código:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://website.com/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
Scrape(movie, "Movie.Jsonl");
}
}
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://website.com/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
Scrape(movie, "Movie.Jsonl");
}
}
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("https://website.com/", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim movie As New Movie()
movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
Dim link = Divs.Css("a")(0)
movie.Title = link.TextContentClean
movie.URL = link.Attributes ("href")
Scrape(movie, "Movie.Jsonl")
End If
Next Divs
End Sub
End Class
¿Qué hay de nuevo?
Implementamos Movie Class para mantener nuestros datos raspados
- Pasamos objetos de película al método Scrape y este entiende nuestro formato y guarda en un formato definido como podemos ver aquí:
Empecemos a raspar una página más detallada.
La página de películas tiene este aspecto:
<div class="mvi-content">
<div class="thumb mvic-thumb"
style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
<div class="mvic-desc">
<h3>Guardians of the Galaxy Vol. 2</h3>
<div class="desc">
Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
</div>
<div class="mvic-info">
<div class="mvici-left">
<p>
<strong>Genre: </strong>
<a href="https://Domain/genre/action/" title="Action">Action</a>,
<a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
<a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
</p>
<p>
<strong>Actor: </strong>
<a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
<a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
<a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
</p>
<p>
<strong>Director: </strong>
<a href="#" title="James Gunn">James Gunn</a>
</p>
<p>
<strong>Country: </strong>
<a href="https://Domain/country/us" title="United States">United States</a>
</p>
</div>
<div class="mvici-right">
<p><strong>Duration:</strong> 136 min</p>
<p><strong>Quality:</strong> <span class="quality">CAM</span></p>
<p><strong>Release:</strong> 2017</p>
<p><strong>IMDb:</strong> 8.3</p>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="mvi-content">
<div class="thumb mvic-thumb"
style="background-image: url(https://img.gocdn.online/2017/04/28/poster/5a08e94ba02118f22dc30f298c603210-guardians-of-the-galaxy-vol-2.jpg);"></div>
<div class="mvic-desc">
<h3>Guardians of the Galaxy Vol. 2</h3>
<div class="desc">
Set to the backdrop of Awesome Mixtape #2, Marvel's Guardians of the Galaxy Vol. 2 continues the team's adventures as they travel throughout the cosmos to help Peter Quill learn more about his true parentage.
</div>
<div class="mvic-info">
<div class="mvici-left">
<p>
<strong>Genre: </strong>
<a href="https://Domain/genre/action/" title="Action">Action</a>,
<a href="https://Domain/genre/adventure/" title="Adventure">Adventure</a>,
<a href="https://Domain/genre/sci-fi/" title="Sci-Fi">Sci-Fi</a>
</p>
<p>
<strong>Actor: </strong>
<a target="_blank" href="https://Domain/actor/chris-pratt" title="Chris Pratt">Chris Pratt</a>,
<a target="_blank" href="https://Domain/actor/-zoe-saldana" title="Zoe Saldana">Zoe Saldana</a>,
<a target="_blank" href="https://Domain/actor/-dave-bautista-" title="Dave Bautista">Dave Bautista</a>
</p>
<p>
<strong>Director: </strong>
<a href="#" title="James Gunn">James Gunn</a>
</p>
<p>
<strong>Country: </strong>
<a href="https://Domain/country/us" title="United States">United States</a>
</p>
</div>
<div class="mvici-right">
<p><strong>Duration:</strong> 136 min</p>
<p><strong>Quality:</strong> <span class="quality">CAM</span></p>
<p><strong>Release:</strong> 2017</p>
<p><strong>IMDb:</strong> 8.3</p>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
<div class="clearfix"></div>
</div>
Podemos ampliar nuestra clase movie con nuevas propiedades(Descripción, Género, Actor, Director, País, Duración, Puntuación IMDB) pero usaremos(Descripción, Género, Actor) sólo para nuestra muestra.
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
public string Description { get; set; }
public List<string> Genre { get; set; }
public List<string> Actor { get; set; }
}
public class Movie
{
public int Id { get; set; }
public string Title { get; set; }
public string URL { get; set; }
public string Description { get; set; }
public List<string> Genre { get; set; }
public List<string> Actor { get; set; }
}
Public Class Movie
Public Property Id() As Integer
Public Property Title() As String
Public Property URL() As String
Public Property Description() As String
Public Property Genre() As List(Of String)
Public Property Actor() As List(Of String)
End Class
Ahora navegaremos a la página Detallada para rasparla.
IronWebScraper le permite añadir más a la función de raspado para raspar diferentes tipos de formatos de página
Como podemos ver aquí:
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://domain/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
}
}
}
public void ParseDetails(Response response)
{
var movie = response.MetaData.Get<Movie>("movie");
var Div = response.Css("div.mvic-desc")[0];
movie.Description = Div.Css("div.desc")[0].TextContentClean;
foreach(var Genre in Div.Css("div > p > a"))
{
movie.Genre.Add(Genre.TextContentClean);
}
foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
{
movie.Actor.Add(Actor.TextContentClean);
}
Scrape(movie, "Movie.Jsonl");
}
}
public class MovieScraper : WebScraper
{
public override void Init()
{
License.LicenseKey = "LicenseKey";
this.LoggingLevel = WebScraper.LogLevel.All;
this.WorkingDirectory = AppSetting.GetAppRoot() + @"\MovieSample\Output\";
this.Request("https://domain/", Parse);
}
public override void Parse(Response response)
{
foreach (var Divs in response.Css("#movie-featured > div"))
{
if (Divs.Attributes ["class"] != "clearfix")
{
var movie = new Movie();
movie.Id = Convert.ToInt32( Divs.GetAttribute("data-movie-id"));
var link = Divs.Css("a")[0];
movie.Title = link.TextContentClean;
movie.URL = link.Attributes ["href"];
this.Request(movie.URL, ParseDetails, new MetaData() { { "movie", movie } });// to scrap Detailed Page
}
}
}
public void ParseDetails(Response response)
{
var movie = response.MetaData.Get<Movie>("movie");
var Div = response.Css("div.mvic-desc")[0];
movie.Description = Div.Css("div.desc")[0].TextContentClean;
foreach(var Genre in Div.Css("div > p > a"))
{
movie.Genre.Add(Genre.TextContentClean);
}
foreach (var Actor in Div.Css("div > p:nth-child(2) > a"))
{
movie.Actor.Add(Actor.TextContentClean);
}
Scrape(movie, "Movie.Jsonl");
}
}
Public Class MovieScraper
Inherits WebScraper
Public Overrides Sub Init()
License.LicenseKey = "LicenseKey"
Me.LoggingLevel = WebScraper.LogLevel.All
Me.WorkingDirectory = AppSetting.GetAppRoot() & "\MovieSample\Output\"
Me.Request("https://domain/", AddressOf Parse)
End Sub
Public Overrides Sub Parse(ByVal response As Response)
For Each Divs In response.Css("#movie-featured > div")
If Divs.Attributes ("class") <> "clearfix" Then
Dim movie As New Movie()
movie.Id = Convert.ToInt32(Divs.GetAttribute("data-movie-id"))
Dim link = Divs.Css("a")(0)
movie.Title = link.TextContentClean
movie.URL = link.Attributes ("href")
Me.Request(movie.URL, AddressOf ParseDetails, New MetaData() From {
{ "movie", movie }
}) ' to scrap Detailed Page
End If
Next Divs
End Sub
Public Sub ParseDetails(ByVal response As Response)
Dim movie = response.MetaData.Get(Of Movie)("movie")
Dim Div = response.Css("div.mvic-desc")(0)
movie.Description = Div.Css("div.desc")(0).TextContentClean
For Each Genre In Div.Css("div > p > a")
movie.Genre.Add(Genre.TextContentClean)
Next Genre
For Each Actor In Div.Css("div > p:nth-child(2) > a")
movie.Actor.Add(Actor.TextContentClean)
Next Actor
Scrape(movie, "Movie.Jsonl")
End Sub
End Class
¿Qué hay de nuevo?
Podemos añadir funciones de raspado(ParseDetails) para raspar páginas detalladas
Trasladamos la función Scrape que genera nuestro fichero a la nueva función
Utilizamos la función IronWebScraper(MetaDatos) para pasar nuestro objeto de película a la nueva función scrape
- Raspamos la página y guardamos los datos de nuestros objetos de película en un archivo