/*Web Site Scraping, .NET. This sample demonstrates the PowerTCP WebASP Control in the .NET environment.
*/
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
namespace Http
{
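/// <summary>
/// Web Form that retrieves a URL with the PowerTCP WebASP control and shows
/// the response headers, the page source, and the links and images found on the page.
/// </summary>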
public class WebForm1 : System.Web.UI.Page
{
    // Session keys written by GetPage and read back by ChangeImage on later postbacks.
    private const string CurrentImageKey = "CurrentImg";
    private const string ImageListKey = "ArrListImages";

    // Placeholder row shown in the link list when a page has no links; it is not a URL.
    private const string NoLinksPlaceholder = "No links to display on this page";

    // Server controls. They are never assigned in this class
    // (presumably they are bound from the page markup - confirm against the .aspx).
    protected System.Web.UI.WebControls.Label lblUrl;
    protected System.Web.UI.WebControls.Label lblHeaders;
    protected System.Web.UI.WebControls.Label lblBody;
    protected System.Web.UI.WebControls.TextBox txtUrl;
    protected System.Web.UI.WebControls.Button cmdGetPage;
    protected System.Web.UI.WebControls.TextBox txtHeaders;
    protected System.Web.UI.WebControls.TextBox txtContent;
    protected System.Web.UI.WebControls.Label lblLinks;
    protected System.Web.UI.WebControls.Label lblImages;
    protected System.Web.UI.WebControls.Button cmdPrevImg;
    protected System.Web.UI.WebControls.Label lblImagesDisplay;
    protected System.Web.UI.WebControls.ListBox lstLinks;
    protected System.Web.UI.WebControls.Button cmdGetOtherPage;
    protected System.Web.UI.WebControls.Label lblTitle;
    protected System.Web.UI.WebControls.Button cmdNextImg;
    protected System.Web.UI.WebControls.Label lblImagesTitle;

    protected DartWebAspDotnet.WebASP WebAsp1 = new DartWebAspDotnet.WebASP(); // WebASP Control for HTTP communication
    protected DartWebAspDotnet.WebPage WebPage1 = new DartWebAspDotnet.WebPage(); // WebPage Object for parsing and resource collecting
    protected System.Collections.ArrayList ArrListImages = new System.Collections.ArrayList(); // holds list of images
    protected System.Collections.ArrayList ArrListLinks = new System.Collections.ArrayList(); // holds list of links

    /// <summary>
    /// Hooks <see cref="Page_Init"/> to the page's Init event so the remaining
    /// event handlers get wired up by <see cref="InitializeComponent"/>.
    /// </summary>
    public WebForm1()
    {
        // Subscribe on this instance directly instead of going through the Page property.
        this.Init += new System.EventHandler(Page_Init);
    }

    /// <summary>
    /// Load handler; intentionally empty in this sample.
    /// </summary>
    private void Page_Load(object sender, System.EventArgs e)
    {
        // Put user code to initialize the page here
    }

    /// <summary>
    /// Init handler; delegates to <see cref="InitializeComponent"/>.
    /// </summary>
    private void Page_Init(object sender, EventArgs e)
    {
        //
        // CODEGEN: This call is required by the ASP.NET Web Form Designer.
        //
        InitializeComponent();
    }

    #region Web Form Designer generated code
    /// <summary>
    /// Required method for Designer support - do not modify
    /// the contents of this method with the code editor.
    /// </summary>
    private void InitializeComponent()
    {
        this.cmdGetPage.Click += new System.EventHandler(this.cmdGetPage_Click);
        this.cmdGetOtherPage.Click += new System.EventHandler(this.cmdGetOtherPage_Click);
        this.cmdPrevImg.Click += new System.EventHandler(this.cmdPrevImg_Click);
        this.cmdNextImg.Click += new System.EventHandler(this.cmdNextImg_Click);
        this.Load += new System.EventHandler(this.Page_Load);
    }
    #endregion

    /// <summary>
    /// Fetches <paramref name="url"/> and fills the form: headers, page source,
    /// the list of links, and the first image. Errors are reported in
    /// <c>txtHeaders</c> instead of being thrown.
    /// </summary>
    /// <param name="url">Address to retrieve; null or empty is reported as an error.</param>
    private void GetPage(string url)
    {
        ClearAll();

        // A null URL used to slip past the original `url != ""` test.
        if (url == null || url == "")
        {
            txtHeaders.Text = "Null URL. Unable to GET";
            // ArrListLinks was just emptied by ClearAll, so this binds an empty list.
            lstLinks.DataSource = ArrListLinks;
            lstLinks.DataBind();
            lblImagesDisplay.Text = "";
            return;
        }

        WebAsp1.Request.Url = url;
        WebAsp1.Timeout = 20000; // NOTE(review): presumably milliseconds - confirm against the WebASP docs

        // HEAD first: tells us whether the page exists and whether its content looks like text.
        bool displayable = false;
        if (!CheckHeader(ref displayable))
        {
            return; // CheckHeader has already written the error to txtHeaders
        }

        try
        {
            WebAsp1.Get();
        }
        catch (Exception getError)
        {
            txtHeaders.Text = "Get Error: " + getError.Message;
            return;
        }

        // Labels render their text as-is, so encode the URL before showing it.
        lblTitle.Text = "Page scrape for " + HttpUtility.HtmlEncode(WebAsp1.Response.Url);
        txtUrl.Text = WebAsp1.Response.Url;

        if (!displayable)
        {
            // it must be binary
            txtHeaders.Text = "Not displayable, binary file";
            return;
        }

        // it is probably text, so display it
        txtHeaders.Text = WebAsp1.Response.Header.All;

        // read source of retrieved page into WebPage Object so we can parse and resource collect
        WebPage1.Source = WebAsp1.Response.Body.ReadString(0);
        if (WebPage1.Source == null || WebPage1.Source == "")
        {
            // some links return a valid header but no content
            txtHeaders.Text = "Page " + WebAsp1.Response.Url + " cannot be retrieved.";
            txtContent.Text = "";
            return;
        }

        txtContent.Text = WebPage1.Source;

        // set UrlHost so relative links will be ok
        WebPage1.UrlHost = WebAsp1.Request.Url;

        CollectResources();

        if (ArrListLinks.Count == 0)
        {
            ArrListLinks.Add(NoLinksPlaceholder);
        }

        ShowFirstImage();

        lstLinks.DataSource = ArrListLinks;
        lstLinks.DataBind();
    }

    /// <summary>
    /// Walks the resources parsed by <c>WebPage1</c> and appends anchor URLs to
    /// <c>ArrListLinks</c> and image URLs to <c>ArrListImages</c>. Null and empty
    /// URLs are skipped for both kinds.
    /// </summary>
    private void CollectResources()
    {
        // NOTE(review): bounds are kept from the original sample. They visit indexes
        // 1 .. Count-2 only. If Resources is a 1-based collection the last two entries
        // are never examined and the condition should probably be `i <= Count` -
        // confirm against the WebPage.Resources documentation before changing.
        for (int i = 1; i < WebPage1.Resources.Count - 1; i++)
        {
            string resourceUrl = WebPage1.Resources.Item(i).Url;

            Trace.Write("i: " + i.ToString());
            Trace.Write("UrlType: " + WebPage1.Resources.Item(i).UrlType);
            Trace.Write("Url: " + resourceUrl);

            // The original tested links against null and images against "";
            // both kinds now reject null as well as empty.
            if (resourceUrl == null || resourceUrl == "")
            {
                continue;
            }

            if (WebPage1.Resources.Item(i).UrlType == DartWebAspDotnet.ResourceConstants.resA)
            {
                ArrListLinks.Add(resourceUrl);
            }
            else if (WebPage1.Resources.Item(i).UrlType == DartWebAspDotnet.ResourceConstants.resImg)
            {
                ArrListImages.Add(resourceUrl);
            }
        }
    }

    /// <summary>
    /// Updates the image area after a fetch: stores the image list in session and
    /// shows the first image, or disables navigation when the page has none.
    /// </summary>
    private void ShowFirstImage()
    {
        bool hasImages = ArrListImages.Count > 0;

        // Set both states explicitly: the buttons used to stay disabled for good
        // once a page without images had been visited.
        cmdPrevImg.Enabled = hasImages;
        cmdNextImg.Enabled = hasImages;

        if (!hasImages)
        {
            lblImages.Text = "No images to display";
            // Do not keep showing the previous page's image and counter.
            lblImagesTitle.Text = "";
            lblImagesDisplay.Text = "";
            return;
        }

        // save current image and array of images to session, this will allow us to access after the postback
        Page.Session[CurrentImageKey] = 0;
        Page.Session[ImageListKey] = ArrListImages;
        ChangeImage("none");
    }

    /// <summary>
    /// Resets the link/image lists, the header and content text boxes, and the
    /// request and response headers of <c>WebAsp1</c>.
    /// </summary>
    private void ClearAll()
    {
        ArrListLinks.Clear();
        ArrListImages.Clear();
        txtHeaders.Text = "";
        txtContent.Text = "";
        WebAsp1.Request.Header.Clear();
        WebAsp1.Response.Header.Clear();
    }

    /// <summary>
    /// Issues a HEAD request for the current <c>WebAsp1.Request.Url</c> and
    /// classifies the response by its Content-Type header.
    /// </summary>
    /// <param name="displayable">
    /// Set to false when the content type contains "application", "image" or
    /// "audio" (or when the request fails); true otherwise.
    /// </param>
    /// <returns>True if the HEAD request succeeded; false if it threw.</returns>
    private bool CheckHeader(ref bool displayable)
    {
        try
        {
            WebAsp1.Head();
        }
        catch (Exception headError)
        {
            txtHeaders.Text = "Head Error: " + headError.Message;
            // Nothing was retrieved, so do not go on to inspect the response headers.
            displayable = false;
            return false;
        }

        int searchIndex = 0;
        // NOTE(review): the "" argument is presumably the value returned when the
        // header is missing - confirm against the Header.Find documentation.
        string contentType = WebAsp1.Response.Header.Find(DartWebAspDotnet.HttpLabelConstants.httpContentType, "", ref searchIndex);
        if (contentType == null)
        {
            contentType = "";
        }

        // Invariant culture keeps the comparison stable in locales with special casing rules for 'I'.
        contentType = contentType.ToLower(System.Globalization.CultureInfo.InvariantCulture);

        // check for binary file
        bool looksBinary = contentType.IndexOf("application") != -1
            || contentType.IndexOf("image") != -1
            || contentType.IndexOf("audio") != -1;
        displayable = !looksBinary;
        return true;
    }

    /// <summary>
    /// Moves the image viewer and renders the selected image.
    /// </summary>
    /// <param name="change">
    /// "none" to redisplay the current image, "next" or "prev" to step forward or
    /// back (wrapping around at either end). Any other value behaves like "none".
    /// </param>
    private void ChangeImage(string change)
    {
        ArrayList images = Session[ImageListKey] as ArrayList;
        object storedIndex = Session[CurrentImageKey];

        // The session is empty when Prev/Next is clicked before any page was fetched
        // or after the session expired; the original code threw here.
        if (images == null || images.Count == 0 || !(storedIndex is int))
        {
            lblImagesTitle.Text = "";
            lblImagesDisplay.Text = "";
            return;
        }

        int count = images.Count;
        int current = (int)storedIndex;
        if (current < 0 || current >= count)
        {
            current = 0; // stale index from an earlier, longer list
        }

        int selected;
        switch (change)
        {
            case "prev":
                selected = (current == 0) ? count - 1 : current - 1;
                break;
            case "next":
                selected = (current == count - 1) ? 0 : current + 1;
                break;
            default:
                selected = current;
                break;
        }

        Session[CurrentImageKey] = selected;
        lblImagesTitle.Text = "Image " + (selected + 1) + " of " + count.ToString();

        // The original statement here was an unterminated string literal whose markup
        // had been lost; emit an image tag for the selected URL. The URL comes from a
        // scraped page, so it is attribute-encoded before being written into the HTML.
        string imageUrl = Convert.ToString(images[selected]);
        lblImagesDisplay.Text = "<img src=\"" + HttpUtility.HtmlAttributeEncode(imageUrl) + "\">";
    }

    /// <summary>
    /// Fetches the page for the link currently selected in <c>lstLinks</c>.
    /// </summary>
    private void cmdGetOtherPage_Click(object sender, System.EventArgs e)
    {
        System.Web.UI.WebControls.ListItem selectedLink = lstLinks.SelectedItem;

        // SelectedItem is null when nothing is highlighted; the placeholder row is not a URL.
        if (selectedLink == null || selectedLink.Text == NoLinksPlaceholder)
        {
            txtHeaders.Text = "No link selected. Unable to GET";
            return;
        }

        GetPage(selectedLink.Text);
    }

    /// <summary>
    /// Shows the previous image.
    /// </summary>
    private void cmdPrevImg_Click(object sender, System.EventArgs e)
    {
        ChangeImage("prev");
    }

    /// <summary>
    /// Shows the next image.
    /// </summary>
    private void cmdNextImg_Click(object sender, System.EventArgs e)
    {
        ChangeImage("next");
    }

    /// <summary>
    /// Fetches the page whose address is typed into <c>txtUrl</c>.
    /// </summary>
    private void cmdGetPage_Click(object sender, System.EventArgs e)
    {
        GetPage(txtUrl.Text);
    }
}
}