C#

웹 크롤링

탑~! 2018. 6. 29. 16:15

HtmlAgilityPack


https://www.nuget.org/packages/HtmlAgilityPack/

using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.XPath;
/// <summary>
/// Console sample that downloads a YouTube watch page and prints its Open Graph title,
/// its Open Graph description, and the plain text of the "eow-description" element.
/// </summary>
/// <remarks>
/// Requires the HtmlAgilityPack NuGet package; its types are referenced by their fully
/// qualified names so that no additional using directive is needed.
/// </remarks>
class Program
{
    /// <summary>
    /// Fetches the target page, parses it as HTML, and writes the extracted values to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Uri targetUri = new Uri("http://www.youtube.com/watch?v=8YkbeycRa2A");

        // Direct casts: a wrong runtime type should fail here with InvalidCastException
        // rather than later with a NullReferenceException from an unchecked "as" cast.
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(targetUri);

        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream webResponseStream = webResponse.GetResponseStream())
        {
            HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
            Encoding targetEncoding = Encoding.UTF8;
            document.Load(webResponseStream, targetEncoding, true);

            IXPathNavigable navigable = document;
            XPathNavigator root = navigable.CreateNavigator();

            string title = ReadMetaContent(root, "og:title");
            string description = ReadMetaContent(root, "og:description");

            // The element may be absent from the page; fall back to an empty description.
            HtmlAgilityPack.HtmlNode descriptionNode = document.GetElementbyId("eow-description");
            string fullDescription = descriptionNode == null
                ? String.Empty
                : StripTags(WebUtility.HtmlDecode(descriptionNode.InnerHtml));

            Console.WriteLine(title);
            Console.WriteLine(description);
            Console.WriteLine(fullDescription);
        }
    }

    /// <summary>
    /// Reads the HTML-decoded <c>content</c> attribute of the
    /// <c>/html/head/meta</c> element whose <c>property</c> attribute equals <paramref name="property"/>.
    /// </summary>
    /// <param name="root">Navigator positioned at the document root.</param>
    /// <param name="property">Value of the meta element's <c>property</c> attribute, e.g. <c>og:title</c>.</param>
    /// <returns>The decoded attribute value, or an empty string when no such meta element exists.</returns>
    private static string ReadMetaContent(XPathNavigator root, string property)
    {
        XPathNavigator attribute = root.SelectSingleNode(
            "/html/head/meta[@property='" + property + "']/@content");

        return attribute == null ? String.Empty : WebUtility.HtmlDecode(attribute.Value);
    }

    /// <summary>
    /// Converts an HTML fragment to plain text: <c>br</c>/<c>hr</c> tags become line breaks,
    /// every other tag is removed, and surrounding whitespace is trimmed.
    /// </summary>
    /// <param name="html">HTML fragment to convert.</param>
    /// <returns>The fragment with all tags removed.</returns>
    private static string StripTags(string html)
    {
        // "[^>]*" allows any number of attribute characters before the closing '>'.
        string text = Regex.Replace(html, @"<(br|hr)[^>]*>", Environment.NewLine);
        return Regex.Replace(text, @"<[^>]*>", String.Empty).Trim();
    }
}



PhantomJS (http://phantomjs.org/) (NuGet 패키지: http://www.nuget.org/packages/PhantomJS/)


Selenium Web Driver (NuGet 패키지: http://www.nuget.org/packages/Selenium.WebDriver/)