web crawler

Ketan Jetty
enthusiasm for technology

web crawler

How to use the web crawler

How to use the web crawler / spider in your code
<cfset domainURL = "http://ketanjetty.com" />

<cfset objSpider = CreateObject("component", "spider").init(domainURL) />
<cfset spideredLinks = objSpider.spider(domainURL) />

<cfdump var="#spideredLinks#">

Web Crawler code in coldfusion

spider.cfc :: web crawler component
<cfcomponent hint="Simple same-domain web crawler. Starting from a seed URL it fetches pages, extracts href links that stay on the configured domain, and returns a struct of every link discovered.">
	<!--- domain used to restrict the crawl to internal links --->
	<cfset variables.domain = "" />
	<!--- link repository: key = URL, value = "" (pending) or "processed" --->
	<cfset variables.allLinks = StructNew() />
	
	<cffunction name="init" access="public" returntype="any" hint="Constructor: stores the crawl domain and seeds the link repository with it.">
		<cfargument name="domain" required="true" type="string">
		
		<cfset variables.domain = arguments.domain />
		<cfset StructInsert(variables.allLinks, variables.domain, "")>
		
		<cfreturn this />
	</cffunction>
	
	<cffunction name="getPageContent" access="private" returntype="string" hint="Fetches a page over HTTP GET and returns its content lower-cased.">
		<cfargument name="pageURL" required="true" type="string">
	
		<cfset var retVal = "" />
		<!--- timeout keeps one dead host from hanging the entire crawl --->
		<cfhttp url="#trim(arguments.pageURL)#" method="get" timeout="30"></cfhttp>
		<cfset retVal = LCase(cfhttp.fileContent) />
		
		<cfreturn retVal />
	</cffunction>
	
	<cffunction name="getPageLinksArray" access="private" returntype="array" hint="Splits page content on 'href=' so each element begins with a candidate link.">
		<cfargument name="pageContent" required="true" type="string">
	
		<cfset var hrefArray = arguments.pageContent.split("href=") />
		
		<cfreturn hrefArray />
	</cffunction>
	
	<cffunction name="getProcessedLinks" access="private" returntype="struct" hint="Cleans raw href fragments and returns a struct of unique, same-domain, non-binary links.">
		<cfargument name="hrefArray" required="true" type="array">
	
		<!--- all locals var-scoped: i, key and insert previously leaked into the
		      shared variables scope, which is unsafe if the CFC is reused --->
		<cfset var temp = "" />
		<cfset var pageLinksStruct = StructNew() />
		<cfset var i = 0 />
		<cfset var key = "" />
		<cfset var insert = 1 />
		<!--- extensions / markers that should never be spidered --->
		<cfset var filesToAvoid = '{ "!":"", ".css":"", ".js":"", ".jpeg":"", ".jpg":"", ".bmp":"", ".gif":"", ".png":"", ".cfc":"", ".swf":"", ".pdf":"", ".ico":"", ".xml":"", ".xls":"", ".doc":"", ".exe":"", ".jar":"", ".tar":"", ".mp3":"" }' />
		<!--- deserialize once; previously re-parsed inside the loop for every link --->
		<cfset var avoidStruct = DeSerializeJson(filesToAvoid) />
	
		<cftry>
			<cfloop from="1" to="#ArrayLen(arguments.hrefArray)#" index="i">
				<cfif Find(">",arguments.hrefArray[i])>
					<!--- isolate the attribute value: cut at tag close, then at the
					      first space, then strip quotes/brackets and trailing slash --->
					<cfset temp = Left(arguments.hrefArray[i],Find(">",arguments.hrefArray[i])) & " " />
					<cfset temp = Left(temp,Find(" ",temp)) />
					<cfset temp = Replace(temp,">"," ","all") />
					<cfset temp = Replace(temp,"<","","all") />
					<cfset temp = Replace(temp,"'","","all") />
					<cfset temp = Replace(temp,'"','','all') />
					<cfset temp = trim(removeSlash(trim(temp))) />				
					
					<cfset insert = 1 />
					
					<cfloop collection="#avoidStruct#" item="key"> 
						<cfif FindNoCase(key,temp)>
							<cfset insert = 0 />
						</cfif>
					</cfloop>
					
					<!--- only keep links that stay on the configured domain --->
					<cfif insert AND FindNoCase(variables.domain, temp) > 	
						<cfif NOT StructKeyExists(pageLinksStruct, temp)>
							<cfset StructInsert(pageLinksStruct, temp, "" )>
						</cfif>
					</cfif>
				</cfif>
			</cfloop>
		<cfcatch type="any">
			<cfdump var="#cfcatch#">
		</cfcatch>
		</cftry>
		
		<cfreturn pageLinksStruct />
	</cffunction>
		
	<cffunction name="removeSlash" access="private" returntype="string" hint="Strips a single trailing slash so URL keys compare consistently.">
		<cfargument name="pageURL" required="true" type="string">
		
		<cfset var page = arguments.pageURL />
		
		<cfif Right(page,1) EQ "/">
			<cfset page = left(page,len(page)-1)>
		</cfif>
		
		<cfreturn page />
	</cffunction>
	
	<cffunction name="addLinks" access="private" returntype="void" hint="Merges newly found links into the central repository, skipping duplicates.">
		<cfargument name="pageLinks" required="true" type="struct">
		
		<cfset var key = "" />
		
		<cfloop collection="#arguments.pageLinks#" item="key"> 
			<cfif NOT StructKeyExists(variables.allLinks, key)>
				<cfset StructInsert(variables.allLinks, key, "" )>
			</cfif>
		</cfloop>
	</cffunction>
	
	<cffunction name="spider" access="public" output="true" returntype="any" hint="Crawls the site starting at _page and returns the struct of all discovered links.">
		<cfargument name="_page" required="true" type="string">
		
		<cfset var page = removeSlash(arguments._page) />
		<cfset var pageLinks = "" />
		<cfset var key = "" />

		<!--- iterative worklist replaces the original one-call-per-link recursion,
		      which could overflow the stack on large sites. Using the key verbatim
		      as the next page also guarantees it gets marked "processed" (the old
		      recursion re-ran removeSlash, so a key with a trailing slash was
		      never marked and could loop forever). --->
		<cfloop condition="Len(page) GT 0">
			<cfset pageLinks = getProcessedLinks(getPageLinksArray(getPageContent(page))) />
			<cfset addLinks(pageLinks) />
			<cfset variables.allLinks[page] = "processed">
			
			<!--- pick the next unprocessed link, if any --->
			<cfset page = "" />
			<cfloop collection="#variables.allLinks#" item="key"> 
				<cfif variables.allLinks[key] NEQ "processed">
					<cfset page = key />
					<cfbreak />
				</cfif>
			</cfloop>
		</cfloop>
		
		<cfreturn variables.allLinks>
	</cffunction>
</cfcomponent>

Web Crawler code in CSharp

Web Crawler code in CSharp
using System;
using System.Text;
using System.Net;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

namespace KJSpider
{
    /// <summary>
    /// Simple same-site console web crawler. Starting from a seed URL it
    /// repeatedly fetches pages, extracts absolute href links belonging to
    /// the same site, and writes every discovered URL to c:\KJSpiderReport.txt.
    /// </summary>
    class Program
    {
        // Bare site name (e.g. "ketanjetty.com") used for the same-site filter.
        static string websitename = string.Empty;
        // Central repository of discovered URLs; processed entries are
        // prefixed with "*spidered::" so they are never fetched twice.
        static List<string> alUrls = new List<string>();
        // Snapshot of alUrls for the current iteration, so spiderURL can
        // append to alUrls while we enumerate.
        static List<string> alTempUrls = new List<string>();
        static string timeStart = string.Empty;
        static string timeEnd = string.Empty;

        // File extensions that are never worth spidering.
        static readonly string[] skipExtensions =
        {
            ".pdf", ".jpg", ".gif", ".png", ".swf",
            ".js", ".css", ".xml", ".xls", ".doc"
        };

        /// <summary>
        /// Entry point. args[0] = start URL, optional args[1] = iteration count
        /// (defaults to 3; non-numeric input also falls back to 3 instead of
        /// crashing, as Convert.ToInt32 previously did).
        /// </summary>
        static void Main(string[] args)
        {
            int iterations = 3;

            if (args.Length > 0)
            {
                string websiteurl = args[0];

                if (args.Length > 1 && !int.TryParse(args[1], out iterations))
                {
                    iterations = 3;     // bad numeric input: keep the default
                }

                Console.WriteLine("websiteurl " + websiteurl);
                Console.Write(" | iterations " + iterations);
                Console.WriteLine("");

                startSpider(websiteurl, iterations);
            }
            else
            {
                Console.WriteLine("no arguments passed");
            }
        }

        /// <summary>Convenience overload: crawl with the default 3 iterations.</summary>
        public static void startSpider(string url)
        {
            startSpider(url, 3);
        }

        /// <summary>
        /// Runs the crawl: derives the site name, seeds the repository, then
        /// performs <paramref name="iterations"/> breadth-first passes and
        /// writes the results to the log file.
        /// </summary>
        public static void startSpider(string url, int iterations)
        {
            // Derive the bare site name for the same-site filter.
            // Fix: the original stripped "www" without the dot, which turned
            // "www.example.com" into ".example.com" and made the Contains()
            // filter below reject every internal link.
            websitename = url.ToLower();
            websitename = websitename.Replace("http://", "");
            websitename = websitename.Replace("www.", "");

            // seed the repository with the start URL
            alUrls.Add(url);

            timeStart = DateTime.Now.ToString();
            for (int i = 0; i < iterations; i++)
            {
                Console.WriteLine("starting iteration: " + i);
                Console.WriteLine("");

                // Work from a snapshot so spiderURL may append new finds
                // to the central repository during enumeration.
                alTempUrls = new List<string>(alUrls);

                spiderRecursive();
                Console.WriteLine("");
            }
            timeEnd = DateTime.Now.ToString();

            writeToLog();
        }

        // One breadth-first pass: spider every URL captured in the snapshot.
        static void spiderRecursive()
        {
            foreach (string strUrl in alTempUrls)
            {
                spiderURL(strUrl.Trim());
            }
        }

        /// <summary>
        /// Fetches one URL, extracts its same-site links and adds unseen ones
        /// to the repository. Already-processed entries are skipped.
        /// </summary>
        static void spiderURL(string strUrl)
        {
            if (strUrl.ToLower().Contains("*spidered::"))
            {
                return;     // already fetched in an earlier iteration
            }

            // Mark the URL as processed in the central repository.
            int pos = alUrls.IndexOf(strUrl);
            if (pos > -1)
            {
                alUrls[pos] = "*spidered::" + alUrls[pos];
            }

            Console.WriteLine("Spidering... " + strUrl);

            string pageData;
            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(strUrl);

                // using-blocks guarantee the response, stream and reader are
                // disposed; the original leaked all three on every request.
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (Stream istrm = resp.GetResponseStream())
                using (StreamReader rdr = new StreamReader(istrm))
                {
                    pageData = rdr.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error: req: " + ex.Message);
                return;
            }

            int curloc = 0;     // current scan position within the page
            string link;

            while ((link = FindLink(pageData, ref curloc)) != null)
            {
                link = link.Trim();
                if (link.Length == 0)
                {
                    break;      // mirrors the original "link.Length > 0" loop guard
                }

                // skip binary/asset links and anything off-site
                if (skipExtensions.Any(ext => link.Contains(ext)))
                {
                    continue;
                }
                if (!link.Contains(websitename))
                {
                    continue;
                }

                // de-duplicate against raw, slash-suffixed and processed forms
                bool addLink = !alUrls.Contains(link)
                    && !alUrls.Contains(link + "/")
                    && !alUrls.Contains("*spidered::" + link)
                    && !alUrls.Contains("*spidered::" + link + "/");

                if (addLink)
                {
                    alUrls.Add(link);
                }
            }
        }

        /// <summary>
        /// Finds the next absolute href link at or after <paramref name="startloc"/>;
        /// returns null when no further link exists. Advances startloc past the match.
        /// </summary>
        static string FindLink(string htmlstr, ref int startloc)
        {
            string uri = null;
            string lowcasestr = htmlstr.ToLower();

            // only absolute, double-quoted hrefs are recognized
            int i = lowcasestr.IndexOf("href=\"http", startloc);
            if (i != -1)
            {
                int start = htmlstr.IndexOf('"', i) + 1;
                int end = htmlstr.IndexOf('"', start);
                if (end == -1)
                {
                    // unterminated attribute at end of document; the original
                    // threw ArgumentOutOfRangeException from Substring here
                    return null;
                }
                uri = htmlstr.Substring(start, end - start);
                startloc = end;
            }

            return uri;
        }

        // Sorts the repository and writes start time, URLs and end time to the log.
        static void writeToLog()
        {
            alUrls.Sort();

            try
            {
                // using guarantees the writer is closed even if a write throws
                using (TextWriter tws = new StreamWriter("c:\\KJSpiderReport.txt"))
                {
                    tws.WriteLine(timeStart);
                    foreach (string str2 in alUrls)
                    {
                        tws.WriteLine(str2);
                    }
                    tws.WriteLine(timeEnd);
                }
            }
            catch (Exception ex)
            {
                Console.Write("Error: log: " + ex.Message);
            }

            Console.WriteLine("Spidering complete.");
        }
    }
}

coldfusion


CF Quick Reference


Ginger CMS
the future of cms, a simple, easy and intuitive content management system ... more


CFTurbine
cf prototyping engine, generates boilerplate code and ... more


Jrun monitor
monitors JRun and auto-restarts it in a timely manner to avoid JRun hangs ... more


Inheritance Config.
uses OOP inheritance to create configuration files ... more


Real Estate App.
complete real estate application using data from MLS ... more


Search Engine Lite
create your own search engine for your web site ... more