C#
웹 크롤링
탑~!
2018. 6. 29. 16:15
HtmlAgilityPack
https://www.nuget.org/packages/HtmlAgilityPack/
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.XPath;
/// <summary>
/// Console sample that downloads a YouTube watch page and prints its Open Graph
/// title, Open Graph description and the text of the "eow-description" element,
/// parsing the HTML with HtmlAgilityPack.
/// </summary>
class Program
{
    /// <summary>
    /// Fetches the page, extracts the three values and writes them to the console.
    /// Values whose node is not present in the page are printed as empty lines.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Uri targetUri = new Uri("http://www.youtube.com/watch?v=8YkbeycRa2A");

        // Direct casts: an unexpected request/response type fails here with a clear
        // InvalidCastException instead of a NullReferenceException further down.
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(targetUri);

        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream webResponseStream = webResponse.GetResponseStream())
        {
            // Fully qualified because the file has no `using HtmlAgilityPack;` directive.
            HtmlAgilityPack.HtmlDocument s = new HtmlAgilityPack.HtmlDocument();
            Encoding targetEncoding = Encoding.UTF8;
            s.Load(webResponseStream, targetEncoding, true);

            IXPathNavigable nav = s;
            XPathNavigator root = nav.CreateNavigator();

            string title = ReadDecodedValue(root, "/html/head/meta[@property='og:title']/@content");
            string description = ReadDecodedValue(root, "/html/head/meta[@property='og:description']/@content");

            // The element may be absent from the page; treat that as an empty description.
            var descriptionNode = s.GetElementbyId("eow-description");
            string fullDescription = descriptionNode == null
                ? String.Empty
                : StripTags(WebUtility.HtmlDecode(descriptionNode.InnerHtml));

            Console.WriteLine(title);
            Console.WriteLine(description);
            Console.WriteLine(fullDescription);
        }
    }

    /// <summary>
    /// Returns the HTML-decoded value of the single node selected by
    /// <paramref name="xpath"/>, or an empty string when nothing matches.
    /// </summary>
    private static string ReadDecodedValue(XPathNavigator root, string xpath)
    {
        XPathNavigator node = root.SelectSingleNode(xpath);
        return node == null ? String.Empty : WebUtility.HtmlDecode(node.Value);
    }

    /// <summary>
    /// Converts br/hr tags to line breaks, removes every remaining tag and trims the result.
    /// </summary>
    private static string StripTags(string html)
    {
        // `[^>]*` (with the quantifier) is required so tags of any length are matched,
        // e.g. "<br />" or "<a href=\"...\">".
        string text = Regex.Replace(html, @"<(br|hr)[^>]*>", Environment.NewLine);
        return Regex.Replace(text, @"<[^>]*>", String.Empty).Trim();
    }
}
PhantomJS (http://phantomjs.org/, NuGet 패키지: http://www.nuget.org/packages/PhantomJS/)
Selenium Web Driver (NuGet 패키지: http://www.nuget.org/packages/Selenium.WebDriver/)