Wednesday, August 10, 2005

Here's the Google page parser

Here's v1.0 of the code...


Sub Parse_Google_Search_Results_Pages_For_URLs()

'**********************************************************
'****************** Code by Excel_Geek ********************
'**********************************************************

' This Excel macro calls up search results pages of a Google search
' using a search term specified in a cell named "SEARCH_TERM". Then it uses a
' regular expression to glean from this page all of the webpage addresses (URLs)
' of the search results and places them in a list in the spreadsheet.

' Setting up the regular expression that recognizes search result URLS on Google pages --
' The regular expression also ingores case, which by the way, i convert all to lower in the end.

Set regEx = CreateObject("VBScript.regexp")
regEx.Global = True
regEx.IgnoreCase = True
regEx.Pattern = """http://([^6""]+)"""

' Now I reformat the search term as typed into the search term as used in Google search query URLs.
' i.e. I replace spaces with pluses and quote marks with %22s.

Range("SEARCH_TERM").Select
Selection.Replace What:="""", Replacement:="%22", LookAt:=xlPart, _
SearchOrder:=xlByRows, MatchCase:=False, SearchFormat:=False, _
ReplaceFormat:=False
Selection.Replace What:=" ", Replacement:="+", LookAt:=xlPart, SearchOrder _
:=xlByRows, MatchCase:=False, SearchFormat:=False, ReplaceFormat:=False

' Set the top limit on the Google page of results to parse.
PAGES = Range("GOOGLE_PAGE").Value

' Need this, apparently, to generate random numbers...
' which we'll need at the bottom of this next FOR_NEXT...

Math.Randomize

' Create a FOR_NEXT loop to get all pages.
For x = 1 To PAGES
' Set the GOOGLE_PAGE value to the page to parse.
Range("GOOGLE_PAGE").Select
ActiveCell.FormulaR1C1 = x

' Selected the SEARCH_URL, as this will need to be passed to the objHTTP.Open command.
Range("SEARCH_URL").Select

' Get the page using the objHTTP object.
Set objHTTP = CreateObject("MSXML2.ServerXMLHTTP")
objHTTP.Open "GET", Selection.Value, False
objHTTP.setRequestHeader "User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"
objHTTP.send ("")

' Define the pageToParse as teh response text of the "GET" method.
pageToParse = objHTTP.responseText
' Clear the objHTTP object.
Set objHTTP = Nothing
' Set expressionmatch as the strings matched using the regular expression defined above.
Set expressionmatch = regEx.Execute(pageToParse)

For Each expressionmatched In expressionmatch

' Got to get rid of any results on Google's servers.
If InStr(1, expressionmatched, "google", 1) > 0 Then

GoTo MOVE_ON

Else

Range("FIRST_URL_HERE").Select
ActiveCell.FormulaR1C1 = LCase(Left(Mid((expressionmatched), 2, 300), Len(expressionmatched) - 2))
Selection.Offset(1, 0).Select
ActiveWorkbook.Names.Add Name:="FIRST_URL_HERE", RefersToR1C1:=Selection

End If
MOVE_ON:

Next

' Setting up code to pause by a random amount of time between 4 and 15 seconds between pages of results.
Dim newHour, newMinute, newSecond, waitTime
newHour = Hour(Now())
newMinute = Minute(Now())
newSecond = Second(Now()) + Int((15 - 4 + 1) * Math.Rnd + 4)
waitTime = TimeSerial(newHour, newMinute, newSecond)
Application.Wait waitTime

Next x

End Sub

1 comment:

Anonymous said...

Thanks for this code



--------------------
dj