using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Drawing;
using System.Threading;
using System.Runtime.InteropServices;
using HtmlAgilityPack;
using ICSharpCode.SharpZipLib.Zip; // For sniffing inside zip files
using ICSharpCode.SharpZipLib.GZip;
using ICSharpCode.SharpZipLib.Tar;
using Devolutions.Utils;
namespace HttpWebCrawler
{
/// <summary>
/// Signature of the handlers notified each time the crawler has decoded an image.
/// </summary>
/// <param name="imageURL">Text identifying the image: its URL, or for an archive entry the archive URL followed by ":" and the entry name.</param>
/// <param name="image">The decoded image.</param>
/// <param name="imageCount">Value of the crawler's running image counter after this image was counted.</param>
public delegate void updateImageURLListDelegate(string imageURL, Image image, int imageCount); // Event handler for updating image list on GUI
/// <summary>
/// Signature of the handlers notified with the URL of each HTML page the crawler has loaded.
/// </summary>
/// <param name="URL">The page URL as text.</param>
public delegate void updateSiteURLListDelegate(string URL);
/// <summary>
/// A result encapsulating the Url and the HtmlDocument
/// </summary>
public class WebPage
{
/// <summary>Address associated with this instance. NOTE(review): no code visible in this file reads or writes it.</summary>
public Uri Url { get; set; }
// Link suffixes the crawler refuses to follow; compared case-insensitively against the end of each href.
private static string[] excludedFileTags = { "aspx", "asp", "doc", "ppt", "jsp", "ps", "php", "java" };
// Upper-case name suffixes that mark a URL or archive entry as an image.
private static string[] imageFileTags = { "JPG", "PNG", "GIF", "BMP", "TIF", "TIFF", "PGM" };
// Backing events; subscribers attach through the public UpdateImageURLListEvent / UpdateSiteURLListEvent accessors.
private static event updateImageURLListDelegate updateImageURLListEvent;
private static event updateSiteURLListDelegate updateSiteURLListEvent;
// Running count of images reported so far; GetAllPagesUnder resets it to 0 when a crawl starts.
private static int imageCount;
/// <summary>
/// Opens the zip archive at <paramref name="url"/> and raises the image-list event for
/// every entry whose name ends with one of the <c>imageFileTags</c> suffixes.
/// </summary>
/// <param name="url">Address of the zip archive to download.</param>
/// <remarks>
/// Best effort: download, archive and image-decoding failures are swallowed so that one
/// bad archive does not stop the crawl.
/// </remarks>
private static void getZipFileEntries(string url)
{
    try
    {
        // Dispose the client and both streams so the HTTP connection is released.
        using (WebClient client = new WebClient())
        using (Stream download = client.OpenRead(url))
        using (ZipInputStream zip = new ZipInputStream(download))
        {
            ZipEntry theEntry;
            while ((theEntry = zip.GetNextEntry()) != null)
            {
                // Ordinal, case-insensitive suffix match: independent of the current culture.
                bool isImage = false;
                foreach (string tag in imageFileTags)
                {
                    if (theEntry.Name.EndsWith(tag, StringComparison.OrdinalIgnoreCase))
                    {
                        isImage = true;
                        break;
                    }
                }
                if (!isImage)
                {
                    continue;
                }

                try
                {
                    // One buffer per entry: a shared buffer would accumulate earlier entries.
                    // It is deliberately not disposed, because GDI+ requires the stream to stay
                    // open for the lifetime of the Image created from it.
                    MemoryStream entryData = new MemoryStream();
                    StreamUtils.Copy(zip, entryData);
                    entryData.Position = 0; // decode from the start of the entry's bytes
                    Image image = Image.FromStream(entryData);
                    imageCount++;
                    fireUpdateImageURLListEvent(url + ":" + theEntry.Name, image, imageCount); // fire event for GUI
                }
                catch (ArgumentException)
                {
                    // Entry is not a decodable image; move on to the next one.
                }
            }
        }
    }
    catch (WebException)
    {
        // The archive could not be downloaded; skip it rather than abort the caller's page.
    }
    catch (ZipException)
    { }
    catch (ArgumentOutOfRangeException)
    { }
    catch (ExternalException)
    { }
    catch (OutOfMemoryException)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
}
/// <summary>
/// Opens the tar archive at <paramref name="url"/> and raises the image-list event for
/// every entry whose name ends with one of the <c>imageFileTags</c> suffixes.
/// </summary>
/// <param name="url">Address of the tar archive to download.</param>
/// <remarks>
/// Best effort: download, archive and image-decoding failures are swallowed so that one
/// bad archive does not stop the crawl.
/// </remarks>
private static void getTarFileEntries(string url)
{
    try
    {
        // Dispose the client and both streams so the HTTP connection is released.
        using (WebClient client = new WebClient())
        using (Stream download = client.OpenRead(url))
        using (TarInputStream tar = new TarInputStream(download))
        {
            TarEntry theEntry;
            while ((theEntry = tar.GetNextEntry()) != null)
            {
                // Ordinal, case-insensitive suffix match: independent of the current culture.
                bool isImage = false;
                foreach (string tag in imageFileTags)
                {
                    if (theEntry.Name.EndsWith(tag, StringComparison.OrdinalIgnoreCase))
                    {
                        isImage = true;
                        break;
                    }
                }
                if (!isImage)
                {
                    continue;
                }

                try
                {
                    // One buffer per entry: a shared buffer would accumulate earlier entries.
                    // It is deliberately not disposed, because GDI+ requires the stream to stay
                    // open for the lifetime of the Image created from it.
                    MemoryStream entryData = new MemoryStream();
                    StreamUtils.Copy(tar, entryData);
                    entryData.Position = 0; // decode from the start of the entry's bytes
                    Image image = Image.FromStream(entryData);
                    imageCount++;
                    fireUpdateImageURLListEvent(url + ":" + theEntry.Name, image, imageCount); // fire event for GUI
                }
                catch (ArgumentException)
                {
                    // Entry is not a decodable image; move on to the next one.
                }
            }
        }
    }
    catch (WebException)
    {
        // The archive could not be downloaded; skip it rather than abort the caller's page.
    }
    catch (TarException)
    { }
    catch (ArgumentOutOfRangeException)
    { }
    catch (ExternalException)
    { }
    catch (OutOfMemoryException)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
}
/// <summary>
/// Opens the gzip-compressed tar archive at <paramref name="url"/> and raises the image-list
/// event for every entry whose name ends with one of the <c>imageFileTags</c> suffixes.
/// </summary>
/// <param name="url">Address of the .tar.gz archive to download.</param>
/// <remarks>
/// Best effort: download, archive and image-decoding failures are swallowed so that one
/// bad archive does not stop the crawl.
/// </remarks>
private static void getGZipFileEntries(string url)
{
    try
    {
        // Dispose the client and every stream so the HTTP connection is released.
        // NOTE(review): the data is already wrapped in a GZipInputStream before being passed
        // through GetCompressedGZipStream/GetUncompressedGZipStream; those helpers are defined
        // outside this file, so the chain is kept exactly as it was - confirm it is intended.
        using (WebClient client = new WebClient())
        using (Stream download = client.OpenRead(url))
        using (Stream gzipStream = new GZipInputStream(download))
        using (Stream tarData = StreamUtils.GetUncompressedGZipStream(StreamUtils.GetCompressedGZipStream(gzipStream)))
        using (TarInputStream tar = new TarInputStream(tarData))
        {
            TarEntry theEntry;
            while ((theEntry = tar.GetNextEntry()) != null)
            {
                // Ordinal, case-insensitive suffix match: independent of the current culture.
                bool isImage = false;
                foreach (string tag in imageFileTags)
                {
                    if (theEntry.Name.EndsWith(tag, StringComparison.OrdinalIgnoreCase))
                    {
                        isImage = true;
                        break;
                    }
                }
                if (!isImage)
                {
                    continue;
                }

                try
                {
                    // One buffer per entry: a shared buffer would accumulate earlier entries.
                    // It is deliberately not disposed, because GDI+ requires the stream to stay
                    // open for the lifetime of the Image created from it.
                    MemoryStream entryData = new MemoryStream();
                    StreamUtils.Copy(tar, entryData);
                    entryData.Position = 0; // decode from the start of the entry's bytes
                    Image image = Image.FromStream(entryData);
                    imageCount++;
                    fireUpdateImageURLListEvent(url + ":" + theEntry.Name, image, imageCount); // fire event for GUI
                }
                catch (ArgumentException)
                {
                    // Entry is not a decodable image; move on to the next one.
                }
            }
        }
    }
    catch (WebException)
    {
        // The archive could not be downloaded; skip it rather than abort the caller's page.
    }
    catch (GZipException)
    { }
    catch (TarException)
    { }
    catch (ArgumentOutOfRangeException)
    { }
    catch (IndexOutOfRangeException)
    { }
    catch (ExternalException)
    { }
    catch (OutOfMemoryException)
    {
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
}
/// <summary>
/// Public subscription point for image notifications; handlers are forwarded to the
/// private backing event.
/// </summary>
public static event updateImageURLListDelegate UpdateImageURLListEvent
{
    add { updateImageURLListEvent += value; }
    remove { updateImageURLListEvent -= value; }
}
/// <summary>
/// Public subscription point for page-URL notifications; handlers are forwarded to the
/// private backing event.
/// </summary>
public static event updateSiteURLListDelegate UpdateSiteURLListEvent
{
    add { updateSiteURLListEvent += value; }
    remove { updateSiteURLListEvent -= value; }
}
/// <summary>
/// Notifies every subscribed handler that an image has been found. Does nothing when
/// no handler is subscribed.
/// </summary>
/// <param name="imageURL">Text identifying where the image came from.</param>
/// <param name="image">The decoded image.</param>
/// <param name="imageCount">Running count to pass to the handlers.</param>
public static void fireUpdateImageURLListEvent(string imageURL, Image image, int imageCount)
{
    // The backing delegate is null until someone subscribes. Work on a snapshot so an
    // unsubscribe between the check and the call cannot null it out.
    updateImageURLListDelegate handler = updateImageURLListEvent;
    if (handler != null)
    {
        handler(imageURL, image, imageCount);
    }
}
/// <summary>
/// Notifies every subscribed handler that a page has been loaded. Does nothing when
/// no handler is subscribed.
/// </summary>
/// <param name="siteURLData">The page URL as text.</param>
public static void fireUpdateSiteURLListEvent(string siteURLData)
{
    // The backing delegate is null until someone subscribes. Work on a snapshot so an
    // unsubscribe between the check and the call cannot null it out.
    updateSiteURLListDelegate handler = updateSiteURLListEvent;
    if (handler != null)
    {
        handler(siteURLData);
    }
}
/// <summary>
/// Crawls breadth-first from the given root URL. For each HTML page fetched over
/// "http://" it raises the site-URL event, downloads the images referenced by
/// &lt;img src&gt; and by links with an image suffix, looks inside linked zip / tar /
/// tar.gz archives, and queues every other new link for a later visit.
/// </summary>
/// <param name="arg">
/// The root <see cref="Uri"/>, passed as <c>object</c>.
/// NOTE(review): presumably so the method can serve as a thread entry point - confirm.
/// </param>
/// <remarks>
/// Best effort: network, URL-format and read errors on one page are swallowed and the
/// crawl moves on. NOTE(review): despite the name, nothing here restricts the crawl to
/// URLs located under the root.
/// </remarks>
public static void GetAllPagesUnder(object arg)
{
    Uri urlRoot = (Uri)arg;
    var queue = new Queue<Uri>();
    var allSiteUrls = new HashSet<Uri>();
    queue.Enqueue(urlRoot);
    allSiteUrls.Add(urlRoot);
    imageCount = 0;

    while (queue.Count > 0)
    {
        Uri url = queue.Dequeue();
        try
        {
            // Only plain http is crawled; everything else (https, mailto, ftp, ...) is dropped.
            if (!url.ToString().StartsWith("http://", StringComparison.Ordinal))
            {
                continue;
            }

            HtmlDocument doc;
            Uri baseUrl;
            HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(url);
            oReq.Timeout = 1000;

            // Close the response as soon as the page is parsed so its connection is free
            // again before the page's images and archives are downloaded.
            using (HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse())
            {
                if (resp.StatusCode != HttpStatusCode.OK)
                {
                    continue;
                }
                if (!resp.ContentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                // Relative links are resolved against the address that actually served the
                // page (after any redirect), which is what a browser does.
                baseUrl = resp.ResponseUri;
                doc = new HtmlDocument();
                using (Stream resultStream = resp.GetResponseStream())
                {
                    doc.Load(resultStream); // The HtmlAgilityPack
                }
            }

            fireUpdateSiteURLListEvent(url.ToString()); // fire event for GUI

            // Images embedded in the page.
            var imageNodes = doc.DocumentNode.SelectNodes(@"//img[@src]");
            if (imageNodes != null)
            {
                foreach (HtmlNode imageNode in imageNodes)
                {
                    HtmlAttribute att = imageNode.Attributes["src"];
                    if (att == null)
                    {
                        continue;
                    }

                    Uri urlNext;
                    if (!Uri.TryCreate(baseUrl, att.Value, out urlNext))
                    {
                        continue; // malformed src: skip this image only, not the whole page
                    }
                    if (!allSiteUrls.Add(urlNext))
                    {
                        continue; // already seen
                    }
                    if (endsWithAnyTag(urlNext.ToString(), imageFileTags))
                    {
                        downloadAndReportImage(urlNext);
                    }
                }
            }

            // Links: images and archives are handled now, everything else is queued.
            var linkNodes = doc.DocumentNode.SelectNodes(@"//a[@href]");
            if (linkNodes != null)
            {
                foreach (HtmlNode linkNode in linkNodes)
                {
                    HtmlAttribute att = linkNode.Attributes["href"];
                    if (att == null)
                    {
                        continue;
                    }

                    string href = att.Value;
                    // Ignore javascript on buttons using a tags.
                    if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }
                    if (endsWithAnyTag(href, excludedFileTags))
                    {
                        continue;
                    }

                    Uri urlNext;
                    if (!Uri.TryCreate(baseUrl, href, out urlNext))
                    {
                        continue; // malformed href: skip this link only, not the whole page
                    }
                    if (!allSiteUrls.Add(urlNext))
                    {
                        continue; // already seen
                    }

                    string target = urlNext.ToString();
                    if (endsWithAnyTag(target, imageFileTags))
                    {
                        downloadAndReportImage(urlNext);
                    }
                    else if (target.EndsWith("ZIP", StringComparison.OrdinalIgnoreCase))
                    {
                        // Sniff inside a zip file
                        getZipFileEntries(target);
                    }
                    else if (target.EndsWith("TAR", StringComparison.OrdinalIgnoreCase))
                    {
                        // Sniff inside a tar file
                        getTarFileEntries(target);
                    }
                    else if (target.EndsWith("TAR.GZ", StringComparison.OrdinalIgnoreCase))
                    {
                        // Decompress and sniff inside a tarball
                        getGZipFileEntries(target);
                    }
                    else
                    {
                        queue.Enqueue(urlNext);
                    }
                }
            }
        }
        catch (WebException)
        { }
        catch (UriFormatException)
        { }
        catch (IOException)
        {
            // The connection failed while the page body was being read; skip this page.
        }
        catch (OutOfMemoryException)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
        }
    }
}

// Returns true when text ends with any of the given suffixes (ordinal, case-insensitive).
private static bool endsWithAnyTag(string text, string[] tags)
{
    foreach (string tag in tags)
    {
        if (text.EndsWith(tag, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }
    return false;
}

// Downloads one image, counts it and raises the image-list event; any failure is ignored.
private static void downloadAndReportImage(Uri imageUrl)
{
    try
    {
        // Buffer the bytes so the HTTP connection can be released immediately. The
        // MemoryStream is left undisposed because GDI+ needs it for the Image's lifetime.
        byte[] data;
        using (WebClient client = new WebClient())
        {
            data = client.DownloadData(imageUrl);
        }
        Image img = Image.FromStream(new MemoryStream(data));
        imageCount++;
        fireUpdateImageURLListEvent(imageUrl.ToString(), img, imageCount); // fire event for GUI
    }
    catch (Exception)
    {
        // Best effort: an unreachable or undecodable image must not stop the crawl.
    }
}
}
}
}