using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Drawing;
using System.Threading;
using System.Runtime.InteropServices;
using HtmlAgilityPack;
using ICSharpCode.SharpZipLib.Zip; // For sniffing inside zip files
using ICSharpCode.SharpZipLib.GZip;
using ICSharpCode.SharpZipLib.Tar;
using Devolutions.Utils;

namespace HttpWebCrawler
{
    /// <summary>
    /// Handler for updating the image list on the GUI: receives the image's URL,
    /// the decoded image and the running count of images found so far.
    /// </summary>
    public delegate void updateImageURLListDelegate(string imageURL, Image image, int imageCount);

    /// <summary>
    /// Handler notified with the URL of every HTML page that was loaded.
    /// </summary>
    public delegate void updateSiteURLListDelegate(string URL);

    /// <summary>
    /// A result encapsulating the Url and the HtmlDocument.
    /// Also hosts the static crawler (<see cref="GetAllPagesUnder"/>) that walks the
    /// links of a site and reports every page and image it finds through static events.
    /// </summary>
    public class WebPage
    {
        public Uri Url { get; set; }

        // Link suffixes that are never followed (compared case-insensitively, no dot required).
        private static string[] excludedFileTags = { "aspx", "asp", "doc", "ppt", "jsp", "ps", "php", "java" };

        // Suffixes that mark a URL or archive entry as an image (compared case-insensitively).
        private static string[] imageFileTags = { "JPG", "PNG", "GIF", "BMP", "TIF", "TIFF", "PGM" };

        private static event updateImageURLListDelegate updateImageURLListEvent;
        private static event updateSiteURLListDelegate updateSiteURLListEvent;

        // Number of images reported since the current crawl started.
        private static int imageCount;

        /// <summary>
        /// Returns true when <paramref name="name"/> ends with one of the image suffixes.
        /// </summary>
        private static bool hasImageExtension(string name)
        {
            // Ordinal comparison instead of ToUpper(): culture-specific casing
            // (e.g. the Turkish dotted I) must not change which files count as images.
            foreach (string tag in imageFileTags)
            {
                if (name.EndsWith(tag, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Returns true when <paramref name="href"/> ends with one of the excluded suffixes.
        /// </summary>
        private static bool hasExcludedExtension(string href)
        {
            foreach (string tag in excludedFileTags)
            {
                if (href.EndsWith(tag, StringComparison.InvariantCultureIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// If the current archive entry is named like an image, copies it out of
        /// <paramref name="archive"/>, decodes it and reports it as "url:entryName".
        /// Entries that cannot be decoded are skipped.
        /// </summary>
        private static void reportArchiveImage(Stream archive, string url, string entryName)
        {
            if (!hasImageExtension(entryName))
            {
                return;
            }

            try
            {
                // One buffer per entry: an Image needs its backing stream to stay open and
                // unchanged for as long as the image lives, so the buffer is neither shared
                // between entries nor disposed here.
                MemoryStream buffer = new MemoryStream();
                StreamUtils.Copy(archive, buffer);

                // NOTE(review): StreamUtils.Copy is not visible from this file; rewind
                // defensively so decoding starts at the first byte whatever it leaves behind.
                if (buffer.CanSeek)
                {
                    buffer.Position = 0;
                }

                Image image = Image.FromStream(buffer);
                imageCount++;
                fireUpdateImageURLListEvent(url + ":" + entryName, image, imageCount); // fire event for GUI
            }
            catch (ArgumentException)
            {
                // Not a decodable image; move on to the next entry.
            }
        }

        /// <summary>
        /// Sniff inside a zip file: downloads the archive at <paramref name="url"/> and
        /// reports every entry that looks like an image.
        /// </summary>
        private static void getZipFileEntries(string url)
        {
            try
            {
                using (WebClient client = new WebClient())
                using (Stream download = client.OpenRead(url))
                using (ZipInputStream zip = new ZipInputStream(download))
                {
                    ZipEntry entry;
                    while ((entry = zip.GetNextEntry()) != null)
                    {
                        reportArchiveImage(zip, url, entry.Name);
                    }
                }
            }
            catch (WebException) { } // an unreachable archive must not abort the rest of the page
            catch (ZipException) { }
            catch (ArgumentOutOfRangeException) { }
            catch (ExternalException) { }
            catch (OutOfMemoryException)
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
        }

        /// <summary>
        /// Sniff inside a tar file: downloads the archive at <paramref name="url"/> and
        /// reports every entry that looks like an image.
        /// </summary>
        private static void getTarFileEntries(string url)
        {
            try
            {
                using (WebClient client = new WebClient())
                using (Stream download = client.OpenRead(url))
                using (TarInputStream tar = new TarInputStream(download))
                {
                    TarEntry entry;
                    while ((entry = tar.GetNextEntry()) != null)
                    {
                        reportArchiveImage(tar, url, entry.Name);
                    }
                }
            }
            catch (WebException) { } // an unreachable archive must not abort the rest of the page
            catch (TarException) { }
            catch (ArgumentOutOfRangeException) { }
            catch (ExternalException) { }
            catch (OutOfMemoryException)
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
        }

        /// <summary>
        /// Decompress and sniff inside a tarball: downloads the .tar.gz at
        /// <paramref name="url"/> and reports every entry that looks like an image.
        /// </summary>
        private static void getGZipFileEntries(string url)
        {
            try
            {
                using (WebClient client = new WebClient())
                using (Stream download = client.OpenRead(url))
                using (Stream gzipStream = new GZipInputStream(download))
                // NOTE(review): both StreamUtils helpers are defined outside this file; the
                // call chain is kept exactly as it was - confirm what they do with the stream.
                using (Stream tarData = StreamUtils.GetUncompressedGZipStream(StreamUtils.GetCompressedGZipStream(gzipStream)))
                using (TarInputStream tar = new TarInputStream(tarData))
                {
                    TarEntry entry;
                    while ((entry = tar.GetNextEntry()) != null)
                    {
                        reportArchiveImage(tar, url, entry.Name);
                    }
                }
            }
            catch (WebException) { } // an unreachable archive must not abort the rest of the page
            catch (GZipException) { }
            catch (TarException) { }
            catch (ArgumentOutOfRangeException) { }
            catch (IndexOutOfRangeException) { }
            catch (ExternalException) { }
            catch (OutOfMemoryException)
            {
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
        }

        /// <summary>
        /// Raised for every image found, with its URL, the decoded image and the running count.
        /// </summary>
        public static event updateImageURLListDelegate UpdateImageURLListEvent
        {
            add { updateImageURLListEvent += value; }
            remove { updateImageURLListEvent -= value; }
        }

        /// <summary>
        /// Raised for every HTML page that was loaded, with the page URL.
        /// </summary>
        public static event updateSiteURLListDelegate UpdateSiteURLListEvent
        {
            add { updateSiteURLListEvent += value; }
            remove { updateSiteURLListEvent -= value; }
        }

        /// <summary>
        /// Notifies image subscribers. Does nothing when nobody is subscribed.
        /// </summary>
        public static void fireUpdateImageURLListEvent(string imageURL, Image image, int imageCount)
        {
            // Copy first: the field is null without subscribers and may change between
            // the check and the call.
            updateImageURLListDelegate handler = updateImageURLListEvent;
            if (handler != null)
            {
                handler(imageURL, image, imageCount);
            }
        }

        /// <summary>
        /// Notifies page subscribers. Does nothing when nobody is subscribed.
        /// </summary>
        public static void fireUpdateSiteURLListEvent(string siteURLData)
        {
            updateSiteURLListDelegate handler = updateSiteURLListEvent;
            if (handler != null)
            {
                handler(siteURLData);
            }
        }

        /// <summary>
        /// Turns a link taken from a page into an absolute Uri. Relative links are
        /// appended to <paramref name="urlPath"/>.
        /// </summary>
        /// <returns>false when the link cannot be parsed, so the caller can skip it.</returns>
        private static bool tryResolveLink(string link, string urlPath, out Uri resolved)
        {
            if (!Uri.TryCreate(link, UriKind.RelativeOrAbsolute, out resolved))
            {
                return false;
            }
            if (resolved.IsAbsoluteUri)
            {
                return true;
            }

            // NOTE(review): plain concatenation is the crawler's existing behaviour; it does
            // not treat root-relative ("/x") or parent ("../x") links the way
            // new Uri(baseUri, link) would - confirm whether that is intended.
            return Uri.TryCreate(urlPath + "/" + link, UriKind.Absolute, out resolved);
        }

        /// <summary>
        /// Downloads <paramref name="imageUrl"/>, decodes it and reports it to the GUI.
        /// Any failure (network, undecodable data, handler error) is ignored so that one
        /// bad image never stops the crawl.
        /// </summary>
        private static void downloadAndReportImage(Uri imageUrl)
        {
            try
            {
                byte[] data;
                using (WebClient client = new WebClient())
                {
                    // Read everything up front so the connection is released right away.
                    data = client.DownloadData(imageUrl);
                }

                // The MemoryStream is intentionally left open: the Image reads from it lazily.
                Image img = Image.FromStream(new MemoryStream(data));
                imageCount++;
                fireUpdateImageURLListEvent(imageUrl.ToString(), img, imageCount); // fire event for GUI
            }
            catch (Exception)
            {
                // Deliberate best effort - see summary.
            }
        }

        /// <summary>
        /// Loads one page, reports it, reports the images it references and queues the
        /// links that still have to be visited.
        /// </summary>
        /// <param name="url">Page to load; anything not starting with "http://" is ignored.</param>
        /// <param name="queue">Receives links to crawl later.</param>
        /// <param name="allSiteUrls">Every URL seen so far, used to visit each one only once.</param>
        private static void crawlPage(Uri url, Queue<Uri> queue, HashSet<Uri> allSiteUrls)
        {
            string urlText = url.ToString();
            if (!urlText.StartsWith("http://", StringComparison.Ordinal))
            {
                return;
            }

            // Get path of html files: relative links are resolved against the containing
            // directory; any other URL is used as the base unchanged.
            string urlPath = urlText;
            if (urlText.EndsWith(".html", StringComparison.Ordinal) || urlText.EndsWith(".htm", StringComparison.Ordinal))
            {
                urlPath = urlText.Substring(0, urlText.LastIndexOf('/'));
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 1000; // milliseconds

            HtmlDocument doc = new HtmlDocument();

            // Dispose the response so its connection goes back to the pool; leaked responses
            // eventually make every further request to the same host time out.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return;
                }
                if (!response.ContentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
                {
                    return;
                }
                using (Stream body = response.GetResponseStream())
                {
                    doc.Load(body); // The HtmlAgilityPack
                }
            }

            fireUpdateSiteURLListEvent(urlText); // fire event for GUI

            // Get image tags
            HtmlNodeCollection imageNodes = doc.DocumentNode.SelectNodes(@"//img[@src]");
            if (imageNodes != null)
            {
                foreach (HtmlNode node in imageNodes)
                {
                    HtmlAttribute att = node.Attributes["src"];
                    if (att == null)
                    {
                        continue;
                    }

                    Uri imageUrl;
                    if (!tryResolveLink(att.Value, urlPath, out imageUrl))
                    {
                        continue; // malformed link: skip it, keep the rest of the page
                    }
                    if (!allSiteUrls.Add(imageUrl))
                    {
                        continue; // keep track of every image url; already seen
                    }
                    if (hasImageExtension(imageUrl.ToString()))
                    {
                        downloadAndReportImage(imageUrl);
                    }
                }
            }

            // Get href tags and queue them
            HtmlNodeCollection anchorNodes = doc.DocumentNode.SelectNodes(@"//a[@href]");
            if (anchorNodes == null)
            {
                return;
            }

            foreach (HtmlNode node in anchorNodes)
            {
                HtmlAttribute att = node.Attributes["href"];
                if (att == null)
                {
                    continue;
                }

                string href = att.Value;

                // Ignore javascript on buttons using a tags
                if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }
                if (hasExcludedExtension(href))
                {
                    continue;
                }

                Uri urlNext;
                if (!tryResolveLink(href, urlPath, out urlNext))
                {
                    continue; // malformed link: skip it, keep the rest of the page
                }
                if (!allSiteUrls.Add(urlNext))
                {
                    continue; // keep track of every page we've queued; already seen
                }

                string nextText = urlNext.ToString();
                if (hasImageExtension(nextText))
                {
                    downloadAndReportImage(urlNext);
                }
                else if (nextText.EndsWith("ZIP", StringComparison.OrdinalIgnoreCase))
                {
                    getZipFileEntries(nextText); // Sniff inside a zip file
                }
                else if (nextText.EndsWith("TAR", StringComparison.OrdinalIgnoreCase))
                {
                    getTarFileEntries(nextText); // Sniff inside a tar file
                }
                else if (nextText.EndsWith("TAR.GZ", StringComparison.OrdinalIgnoreCase))
                {
                    getGZipFileEntries(nextText); // Decompress and sniff inside the tarball
                }
                else
                {
                    queue.Enqueue(urlNext);
                }
            }
        }

        /// <summary>
        /// Crawls breadth-first starting at the given root URL. Every loaded HTML page is
        /// reported through <see cref="UpdateSiteURLListEvent"/> and every image found (linked
        /// directly or inside zip / tar / tar.gz archives) through
        /// <see cref="UpdateImageURLListEvent"/>. The image counter restarts at zero.
        /// </summary>
        /// <param name="arg">
        /// The root <see cref="Uri"/>. NOTE(review): typed as object presumably so the method
        /// can be used as a thread start routine - confirm against the caller.
        /// </param>
        /// <exception cref="InvalidCastException"><paramref name="arg"/> is not a Uri.</exception>
        public static void GetAllPagesUnder(object arg)
        {
            Uri urlRoot = (Uri)arg;

            Queue<Uri> queue = new Queue<Uri>();
            HashSet<Uri> allSiteUrls = new HashSet<Uri>();
            queue.Enqueue(urlRoot);
            allSiteUrls.Add(urlRoot);
            imageCount = 0;

            while (queue.Count > 0)
            {
                Uri url = queue.Dequeue();
                try
                {
                    crawlPage(url, queue, allSiteUrls);
                }
                catch (WebException) { }        // page unreachable or timed out: skip it
                catch (UriFormatException) { }
                catch (OutOfMemoryException)
                {
                    GC.Collect();
                    GC.WaitForPendingFinalizers();
                }
            }
        }
    }
}