GuiWebCrawler

 

SocketWaitEvent and asynchronous HttpRequest in a GUI environment
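In short: HttpRequest is switched to non-blocking mode with Timeout(0), each call to Do() then advances the request by a single step without ever waiting, and SocketWaitEvent is used to sleep until one of the sockets involved needs attention. The following minimal, single-request console sketch illustrates the pattern in isolation; it assumes nothing beyond the U++ Core calls that the example itself uses:

#include <Core/Core.h>

using namespace Upp;

CONSOLE_APP_MAIN
{
    HttpRequest r("http://www.ultimatepp.org");
    r.Timeout(0);                     // non-blocking: Do() never waits
    for(;;) {
        r.Do();                       // advance the request by one step
        if(!r.InProgress())
            break;                    // finished or failed
        SocketWaitEvent we;
        we.Add(r, r.GetWaitEvents()); // wait until the socket is ready...
        we.Wait(10);                  // ...but at most 10 ms
        // a GUI application can do other work here, e.g. ProcessEvents(),
        // which is exactly what WebCrawler::Run below does
    }
    Cout() << (r.IsSuccess() ? r.GetContent() : r.GetErrorDesc()) << '\n';
}

GuiWebCrawler below runs the same loop over up to 60 parallel requests, calling ProcessEvents() between steps to keep the GUI responsive.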

 

 

GuiWebCrawler.cpp

 

#include <CtrlLib/CtrlLib.h>
#include <Core/SSL/SSL.h>   // adds https:// support to HttpRequest

using namespace Upp;

#define LAYOUTFILE <GuiWebCrawler/GuiWebCrawler.lay>
#include <CtrlCore/lay.h>   // generates WithCrawlerLayout from the .lay file

struct WebCrawler : public WithCrawlerLayout<TopWindow> {
    BiVector<int>          todo; // queue of url indices waiting to be downloaded
    VectorMap<String, int> url;  // maps url -> index of the url where it was found

    struct Work {         // a single download in progress
        HttpRequest http; // asynchronous request
        int         urli; // index into 'url'
    };
    Array<Work>      http;  // active downloads
    int64            total; // total bytes downloaded

    void ExtractUrls(const String& html, int srci);
    void ShowPath();
    void OpenURL(ArrayCtrl& a);

public:
    void Run();

    WebCrawler();
};

// true for characters we accept as part of an url
bool IsUrlChar(int c)
{
    return c == ':' || c == '.' || IsAlNum(c) || c == '_' || c == '%' || c == '/';
}

void WebCrawler::ExtractUrls(const String& html, int srci)
{ // very simple url extraction: find "http://" or "https://", then take url characters
    int q = 0;
    while(q < html.GetCount()) {
        int http = html.Find("http://", q);
        int https = html.Find("https://", q);
        q = min(http < 0 ? https : http, https < 0 ? http : https); // nearer match, -1 if none
        if(q < 0)
            return;
        int b = q;
        while(q < html.GetCount() && IsUrlChar(html[q]))
            q++;
        String u = html.Mid(b, q - b);
        if(url.Find(u) < 0) { // new url?
            todo.AddTail(url.GetCount()); // queue the index it will get in 'url'
            url.Add(u, srci); // remember it, with the index of the url it came from
        }
    }
}

void WebCrawler::Run()
{
    String seed = "www.ultimatepp.org"; // the crawling starts here
    if(!EditText(seed, "GuiWebSpider", "Seed URL"))
        return;
    todo.AddTail(0); // queue the seed url as index 0
    url.Add(seed);
    Open(); // open the main window
    while(IsOpen()) { // crawl until the window is closed
        ProcessEvents(); // keep the GUI responsive
        while(todo.GetCount() && http.GetCount() < 60) { // issue up to 60 parallel requests
            int i = todo.Head();
            todo.DropHead();
            Work& w = http.Add();
            w.urli = i;
            w.http.Url(url.GetKey(i))
                  .UserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:11.0) Gecko/20100101 Firefox/11.0")
                  .Timeout(0); // Timeout(0) makes the request non-blocking
            work.Add(url.GetKey(i));
            work.HeaderTab(0).SetText(Format("URL (%d)", work.GetCount()));
        }
        SocketWaitEvent we; // sleep until some socket is ready, 10 ms at most
        for(int i = 0; i < http.GetCount(); i++)
            we.Add(http[i].http, http[i].http.GetWaitEvents());
        we.Wait(10);
        int i = 0;
        while(i < http.GetCount()) { // advance all active requests
            Work& w = http[i];
            w.http.Do(); // performs a single step of the request
            String u = url.GetKey(w.urli);
            int q = work.Find(u);
            if(w.http.InProgress()) { // still running, display the current phase
                if(q >= 0)
                    work.Set(q, 1, w.http.GetPhaseName());
                i++;
            }
            else { // finished or failed, move it from 'work' to 'finished'
                String html = w.http;
                total += html.GetCount();
                finished.Add(u, w.http.IsError() ? String().Cat() << w.http.GetErrorDesc()
                                                 : String().Cat() << w.http.GetStatusCode()
                                                   << ' ' << w.http.GetReasonPhrase()
                                                   << " (" << html.GetCount() << " bytes)",
                             w.urli); // w.urli is stored as a third value, used by ShowPath
                finished.HeaderTab(0).SetText(Format("Finished (%d)", finished.GetCount()));
                finished.HeaderTab(1).SetText(Format("Response (%` KB)", total >> 10));
                if(w.http.IsSuccess()) { // scan successfully downloaded page for more urls
                    ExtractUrls(html, w.urli);
                    Title(AsString(url.GetCount()) + " URLs found");
                }
                http.Remove(i);
                work.Remove(q);
            }
        }
    }
}

void WebCrawler::ShowPath()
{ // display the chain of urls that led the crawler to the url under the cursor
    path.Clear();
    if(!finished.IsCursor())
        return;
    int i = finished.Get(2); // third value of the row is the url index
    Vector<String> p;
    for(;;) { // follow the 'found at' links back to the seed url (index 0)
        p.Add(url.GetKey(i));
        if(i == 0)
            break;
        i = url[i];
    }
    for(int i = p.GetCount() - 1; i >= 0; i--) // show in seed-to-target order
        path.Add(p[i]);
}

void WebCrawler::OpenURL(ArrayCtrl& a)
{ // open the url of the current row in the default browser, copy it to the clipboard too
    String u = a.GetKey();
    WriteClipboardText(u);
    LaunchWebBrowser(u);
}

WebCrawler::WebCrawler()
{
    CtrlLayout(*this, "WebCrawler");
    work.AddColumn("URL");
    work.AddColumn("Status");
    finished.AddColumn("Finished");
    finished.AddColumn("Response");
    finished.WhenCursor = [=] { ShowPath(); };            // update 'path' on cursor change
    finished.WhenLeftDouble = [=] { OpenURL(finished); }; // open url in browser
    path.AddColumn("Path");
    path.WhenLeftDouble = [=] { OpenURL(path); };
    total = 0;
    Zoomable().Sizeable();
}

GUI_APP_MAIN
{
    HttpRequest::Trace(); // log http traffic, useful for debugging

    WebCrawler().Run();
}

GuiWebCrawler.lay

 

LAYOUT(CrawlerLayout, 680, 508)
    ITEM(ArrayCtrl, work, LeftPosZ(4, 356).TopPosZ(4, 500))
    ITEM(ArrayCtrl, finished, LeftPosZ(364, 312).TopPosZ(4, 324))
    ITEM(ArrayCtrl, path, LeftPosZ(364, 312).TopPosZ(332, 172))
END_LAYOUT
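The LAYOUT block is consumed by the #define LAYOUTFILE / #include <CtrlCore/lay.h> pair at the top of GuiWebCrawler.cpp: the include expands it into the WithCrawlerLayout<T> template, whose members work, finished and path are the three ArrayCtrl widgets declared by the ITEM entries. A minimal sketch of the pattern (the class name here is hypothetical, everything else mirrors the code above):

struct MyCrawlerWindow : WithCrawlerLayout<TopWindow> { // hypothetical name
    MyCrawlerWindow() {
        CtrlLayout(*this, "My title"); // places work, finished and path
                                       // using the geometry from the .lay file
    }
};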

 

 

 

 

 
