Private Sub cmdStartCrawling_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles cmdStartCrawling.Click
If cmdStartCrawling.Text = "Start Crawling" Then
Application.DoEvents()
lblCrawlingStatus.Text = "Currently Crawling"
lblCrawlingStatus.ForeColor = Green
cmdStartCrawling.Text = "Stop Crawling"
Dim Connection As New OleDb.OleDbConnection
Connection.ConnectionString = "PROVIDER=Microsoft.Jet.OLEDB.4.0;Data Source = C:\database.mdb"
Connection.Open()
Dim myDataSet As New DataSet
Dim myDataAdapter As OleDb.OleDbDataAdapter
Dim sql As String
sql = "SELECT * FROM uris"
myDataAdapter = New OleDb.OleDbDataAdapter(sql, Connection)
myDataAdapter.Fill(myDataSet, "uris")
Dim myCommandBuilder As New OleDb.OleDbCommandBuilder(myDataAdapter)
Dim myDataSetNewRow As DataRow
myCommandBuilder.GetUpdateCommand()
Dim i As Integer = 0
While i < 10
Dim CurrentURI As String = myDataSet.Tables("uris").Rows(i).Item(1)
Dim CurrentSource As String = ""
Try
CurrentSource = GetURL(CurrentURI)
lblCrawlingCurrentlyData.Text = CurrentURI
Catch ex As Exception
MsgBox("Can't load Web page" & vbCrLf & ex.Message)
End Try
Dim MyHTMLURI As Match
Dim myMatchesURI As MatchCollection
Dim myRegexURI As New Regex( _
"<A[^>]*?HREF\s*=\s*""([^""]+)""[^>]*?>([\s\S]*?)<\/A>", _
RegexOptions.IgnoreCase)
Dim myRegexURI2 As New Regex("href\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))", _
RegexOptions.IgnoreCase Or RegexOptions.Compiled)
Dim successfulMatchURI As Match
myMatchesURI = myRegexURI.Matches(CurrentSource)
Dim URISStolen As Integer = 0
For Each successfulMatchURI In myMatchesURI
MyHTMLURI = myRegexURI2.Match(successfulMatchURI.Value)
Dim HTMLURIRemove As String
HTMLURIRemove = MyHTMLURI.ToString.Remove(0, 6)
HTMLURIRemove = HTMLURIRemove.Remove(HTMLURIRemove.Length - 1, 1)
If HTMLURIRemove.StartsWith("http://") Then
myDataSetNewRow = myDataSet.Tables("uris").NewRow
If HTMLURIRemove.Length > 255 Then
HTMLURIRemove = HTMLURIRemove.Remove(250, HTMLURIRemove.Length - 250)
End If
If CurrentURI.Length > 255 Then
CurrentURI.Remove(250, CurrentURI.Length - 200)
End If
myDataSetNewRow.Item("uri") = HTMLURIRemove
myDataSetNewRow.Item("parentPage") = CurrentURI
myDataSetNewRow.Item("dateAdded") = My.Computer.Clock.LocalTime
myDataSet.Tables("uris").Rows.Add(myDataSetNewRow)
myDataAdapter.Update(myDataSet, "uris")
URISStolen = URISStolen + 1
lblCrawlingUriData.Text = URISStolen.ToString
Else
End If
Next
i = i + 1
End While
Connection.Close()
Else
lblCrawlingStatus.Text = "Currently Not Crawling"
lblCrawlingStatus.ForeColor = Red
cmdStartCrawling.Text = "Start Crawling"
End If
End Sub