/*Web Site Scraping, .NET.
This sample demonstrates the PowerTCP WebASP Control in the .NET environment.
*/
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;

namespace Http
{
    /// <summary>
    /// Code-behind for the page-scraping sample. Fetches a URL with the WebASP
    /// control, shows the response headers and body, lists the links found on
    /// the page, and keeps the list of image URLs in session state so the
    /// Prev/Next buttons can step through them across postbacks.
    /// </summary>
    public class WebForm1 : System.Web.UI.Page
    {
        protected System.Web.UI.WebControls.Label lblUrl;
        protected System.Web.UI.WebControls.Label lblHeaders;
        protected System.Web.UI.WebControls.Label lblBody;
        protected System.Web.UI.WebControls.TextBox txtUrl;
        protected System.Web.UI.WebControls.Button cmdGetPage;
        protected System.Web.UI.WebControls.TextBox txtHeaders;
        protected System.Web.UI.WebControls.TextBox txtContent;
        protected System.Web.UI.WebControls.Label lblLinks;
        protected System.Web.UI.WebControls.Label lblImages;
        protected System.Web.UI.WebControls.Button cmdPrevImg;
        protected System.Web.UI.WebControls.Label lblImagesDisplay;
        protected System.Web.UI.WebControls.ListBox lstLinks;
        protected System.Web.UI.WebControls.Button cmdGetOtherPage;
        protected System.Web.UI.WebControls.Label lblTitle;
        protected System.Web.UI.WebControls.Button cmdNextImg;
        protected System.Web.UI.WebControls.Label lblImagesTitle;

        // WebASP Control for HTTP communication
        protected DartWebAspDotnet.WebASP WebAsp1 = new DartWebAspDotnet.WebASP();
        // WebPage Object for parsing and resource collecting
        protected DartWebAspDotnet.WebPage WebPage1 = new DartWebAspDotnet.WebPage();
        // holds list of images
        protected System.Collections.ArrayList ArrListImages = new System.Collections.ArrayList();
        // holds list of links
        protected System.Collections.ArrayList ArrListLinks = new System.Collections.ArrayList();

        // Value assigned to WebAsp1.Timeout before each request.
        // NOTE(review): presumably milliseconds - confirm against the WebASP documentation.
        private const int RequestTimeout = 20000;

        // Placeholder row shown in the link list when a page has no links.
        private const string NoLinksMessage = "No links to display on this page";

        // Session keys shared by GetPage (writer) and ChangeImage (reader/writer).
        private const string SessionKeyCurrentImage = "CurrentImg";
        private const string SessionKeyImages = "ArrListImages";

        public WebForm1()
        {
            this.Init += new System.EventHandler(Page_Init);
        }

        private void Page_Load(object sender, System.EventArgs e)
        {
            // Put user code to initialize the page here
        }

        private void Page_Init(object sender, EventArgs e)
        {
            //
            // CODEGEN: This call is required by the ASP.NET Web Form Designer.
            //
            InitializeComponent();
        }

        #region Web Form Designer generated code
        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.cmdGetPage.Click += new System.EventHandler(this.cmdGetPage_Click);
            this.cmdGetOtherPage.Click += new System.EventHandler(this.cmdGetOtherPage_Click);
            this.cmdPrevImg.Click += new System.EventHandler(this.cmdPrevImg_Click);
            this.cmdNextImg.Click += new System.EventHandler(this.cmdNextImg_Click);
            this.Load += new System.EventHandler(this.Page_Load);
        }
        #endregion

        /// <summary>
        /// Issues a HEAD followed by a GET for <paramref name="url"/> and fills
        /// the page controls with the result: headers, body text, link list and
        /// the image list held in session state. Failures are reported through
        /// txtHeaders rather than thrown.
        /// </summary>
        /// <param name="url">Address to retrieve; null or empty reports "Null URL".</param>
        private void GetPage(string url)
        {
            bool displayable = false; // true if content is presumed to be ascii

            ClearAll();

            if (url == null || url.Length == 0)
            {
                txtHeaders.Text = "Null URL. Unable to GET";
                // ArrListLinks was just cleared, so this binds an empty list.
                lstLinks.DataSource = ArrListLinks;
                lstLinks.DataBind();
                lblImagesDisplay.Text = "";
                return;
            }

            WebAsp1.Request.Url = url;
            WebAsp1.Timeout = RequestTimeout;

            // HEAD first: confirms the page exists and tells us whether the body is text.
            if (!CheckHeader(ref displayable))
            {
                return; // CheckHeader already reported the error in txtHeaders
            }

            try
            {
                WebAsp1.Get();
            }
            catch (Exception getError)
            {
                txtHeaders.Text = "Get Error: " + getError.Message;
                return;
            }

            lblTitle.Text = "Page scrape for " + WebAsp1.Response.Url;
            txtUrl.Text = WebAsp1.Response.Url;

            if (!displayable)
            {
                // it must be binary
                txtHeaders.Text = "Not displayable, binary file";
                return;
            }

            // it is probably ascii so display
            txtHeaders.Text = WebAsp1.Response.Header.All;

            // read source of retrieved page into WebPage Object so we can parse and resource collect
            WebPage1.Source = WebAsp1.Response.Body.ReadString(0);

            if (WebPage1.Source == null || WebPage1.Source == "")
            {
                // some links return a valid header but no content
                txtHeaders.Text = "Page " + WebAsp1.Response.Url + " cannot be retrieved.";
                txtContent.Text = "";
                return;
            }

            txtContent.Text = WebPage1.Source;

            // set UrlHost so relative links will be ok
            WebPage1.UrlHost = WebAsp1.Request.Url;

            CollectResources();
            ShowLinksAndImages();
        }

        /// <summary>
        /// Walks WebPage1.Resources and copies anchor URLs into ArrListLinks and
        /// image URLs into ArrListImages, skipping null or empty URLs.
        /// </summary>
        private void CollectResources()
        {
            // The original bound (i &lt; Count-1) always skipped index Count-1, which is a
            // valid index whether the collection is 0- or 1-based.
            // NOTE(review): the start index of 1 is kept from the original. If Resources is
            // 1-based the last item (index Count) is still not visited; if it is 0-based the
            // first item (index 0) is not. Confirm the indexing base in the WebASP docs.
            for (int i = 1; i < WebPage1.Resources.Count; i++)
            {
                Trace.Write("i: " + i.ToString());
                Trace.Write("UrlType: " + WebPage1.Resources.Item(i).UrlType);
                Trace.Write("Url: " + WebPage1.Resources.Item(i).Url);

                // Links and images use the same null-and-empty filter; previously links
                // only rejected null and images only rejected "".
                if (WebPage1.Resources.Item(i).Url == null || WebPage1.Resources.Item(i).Url == "")
                {
                    continue;
                }

                if (WebPage1.Resources.Item(i).UrlType == DartWebAspDotnet.ResourceConstants.resA)
                {
                    ArrListLinks.Add(WebPage1.Resources.Item(i).Url);
                }

                if (WebPage1.Resources.Item(i).UrlType == DartWebAspDotnet.ResourceConstants.resImg)
                {
                    ArrListImages.Add(WebPage1.Resources.Item(i).Url);
                }
            }
        }

        /// <summary>
        /// Binds the collected links to lstLinks and either starts the image
        /// browser at the first image or disables it when there are none.
        /// </summary>
        private void ShowLinksAndImages()
        {
            // check for any links
            if (ArrListLinks.Count == 0)
            {
                ArrListLinks.Add(NoLinksMessage);
            }

            // check for any images
            if (ArrListImages.Count == 0)
            {
                lblImages.Text = "No images to display";
                cmdPrevImg.Enabled = false;
                cmdNextImg.Enabled = false;
                // Drop the "Image x of y" caption left over from a previously scraped page.
                lblImagesTitle.Text = "";
                lblImagesDisplay.Text = "";
            }
            else
            {
                // Enabled is persisted across postbacks, so the buttons must be switched
                // back on after an earlier page without images disabled them.
                // NOTE(review): lblImages keeps "No images to display" once set; its
                // original caption is not visible here, so it is not restored.
                cmdPrevImg.Enabled = true;
                cmdNextImg.Enabled = true;

                // save current image and array of images to session, this will allow us to access after the postback
                Page.Session[SessionKeyCurrentImage] = 0;
                Page.Session[SessionKeyImages] = ArrListImages;
                ChangeImage("none");
            }

            lstLinks.DataSource = ArrListLinks;
            lstLinks.DataBind();
        }

        /// <summary>
        /// Resets the per-request state: both URL lists, the header and content
        /// text boxes, and the request/response headers on the WebASP control.
        /// </summary>
        private void ClearAll()
        {
            ArrListLinks.Clear();
            ArrListImages.Clear();
            txtHeaders.Text = "";
            txtContent.Text = "";
            WebAsp1.Request.Header.Clear();
            WebAsp1.Response.Header.Clear();
        }

        /// <summary>
        /// Sends a HEAD request for the URL already set on WebAsp1 and inspects
        /// the Content-Type header of the response.
        /// </summary>
        /// <param name="displayable">
        /// Set to false when the content type looks binary (application, image,
        /// audio, video) or when the HEAD request fails; true otherwise.
        /// </param>
        /// <returns>True if the HEAD request completed without throwing.</returns>
        private bool CheckHeader(ref bool displayable)
        {
            try
            {
                WebAsp1.Head();
            }
            catch (Exception headError)
            {
                txtHeaders.Text = "Head Error: " + headError.Message;
                // No usable response, so do not go on to read its headers.
                displayable = false;
                return false;
            }

            // NOTE(review): the ref int is passed as 0 exactly as in the original sample;
            // its meaning (presumably a search position) is defined by the WebASP API.
            int findIndex = 0;
            string contentType = WebAsp1.Response.Header.Find(
                DartWebAspDotnet.HttpLabelConstants.httpContentType, "", ref findIndex);

            displayable = !IsBinaryContentType(contentType);
            return true;
        }

        /// <summary>
        /// Decides whether a Content-Type value names content that should not be
        /// shown as text. A null or empty value is treated as text.
        /// </summary>
        /// <param name="contentType">Raw Content-Type header value; may be null.</param>
        /// <returns>True when the value contains application, image, audio or video.</returns>
        private static bool IsBinaryContentType(string contentType)
        {
            if (contentType == null || contentType.Length == 0)
            {
                return false;
            }

            // Invariant culture: a culture-sensitive ToLower() maps 'I' to a dotless i
            // under Turkish cultures and would miss "APPLICATION" / "IMAGE".
            string lowered = contentType.ToLower(System.Globalization.CultureInfo.InvariantCulture);

            return lowered.IndexOf("application") != -1
                || lowered.IndexOf("image") != -1
                || lowered.IndexOf("audio") != -1
                || lowered.IndexOf("video") != -1;
        }

        /// <summary>
        /// Moves the image browser and updates the "Image x of y" caption.
        /// </summary>
        /// <param name="change">
        /// "none" to redisplay the current image, "next" for the next image or
        /// "prev" for the previous one; both directions wrap around. Any other
        /// value behaves like "none".
        /// </param>
        private void ChangeImage(string change)
        {
            System.Collections.ArrayList images = Session[SessionKeyImages] as System.Collections.ArrayList;
            object storedIndex = Session[SessionKeyCurrentImage];

            // The session may have expired, or a button may be clicked before any page
            // with images was scraped; unboxing null would throw.
            if (images == null || images.Count == 0 || !(storedIndex is int))
            {
                lblImagesDisplay.Text = "";
                return;
            }

            int lastIndex = images.Count - 1;
            int current = (int)storedIndex;
            if (current < 0 || current > lastIndex)
            {
                current = 0; // stale index from a longer list
            }

            int selected;
            switch (change)
            {
                case "prev":
                    // wrap from the first image to the last
                    selected = (current == 0) ? lastIndex : current - 1;
                    break;
                case "next":
                    // wrap from the last image to the first
                    selected = (current == lastIndex) ? 0 : current + 1;
                    break;
                default:
                    selected = current;
                    break;
            }

            Session[SessionKeyCurrentImage] = selected;
            lblImagesTitle.Text = "Image " + (selected + 1) + " of " + images.Count.ToString();

            // NOTE(review): this only clears the label; images[selected] is never rendered.
            // The markup that displays the image appears to be missing from this copy of
            // the sample - confirm against the original and HTML-attribute-encode the URL
            // if it is restored, since it comes from a remote page.
            lblImagesDisplay.Text = "";
        }

        /// <summary>
        /// Scrapes the link currently selected in lstLinks. Does nothing useful
        /// (beyond a message) when no real link is selected.
        /// </summary>
        private void cmdGetOtherPage_Click(object sender, System.EventArgs e)
        {
            // SelectedItem is null when the user has not picked a row.
            if (lstLinks.SelectedItem == null)
            {
                txtHeaders.Text = "No link selected.";
                return;
            }

            string selectedLink = lstLinks.SelectedItem.ToString();

            // The placeholder row is not a URL; do not try to fetch it.
            if (selectedLink == NoLinksMessage)
            {
                txtHeaders.Text = "No link selected.";
                return;
            }

            GetPage(selectedLink);
        }

        private void cmdPrevImg_Click(object sender, System.EventArgs e)
        {
            ChangeImage("prev");
        }

        private void cmdNextImg_Click(object sender, System.EventArgs e)
        {
            ChangeImage("next");
        }

        private void cmdGetPage_Click(object sender, System.EventArgs e)
        {
            GetPage(txtUrl.Text);
        }
    }
}