Domanda

Sto cercando di includere questa funzione curl nella mia classe ma ho problemi con CURLOPT_WRITEFUNCTION.Seguire la compilazione non mi ha trovato una soluzione.Ho anche provato alcune cose basate su StackOverflow per non essere disponibili.

Ecco il mio tentativo (sostituendo 'writer' in questo codice)

nodo :: writer & node :: writer std :: bind1st (std :: mem_fun (& node :: writer), questo);

Ecco il mio codice:

#ifndef NODE_H_
#define NODE_H_

int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

class node {
 /*
  * general struct to hold html element properties
  */
 struct tag;

 /*
  * the url and source of the page
  */
 std::string url;
 std::string source;

 /*
  *  vector of structures that store tag elements
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * grab source with curl
  */
 std::string curlHttpget(const std::string &url);

 /*
  * add tag structs to vector
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors
  */
 node(){}
 node(std::string);

 /*
  * deconstructors
  */
 ~node(){}

 /*
  * crawl page
  */
 void load(std::string seed);//crawls the page

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags
 void display_anchors();

 /*
  * heading tags
  */
 void get_headings();// scrape heading tags
 void display_headings();
};
/*
 * for all stored html elements
 */
struct node::tag {
 std::string text;
 std::string properties;
 tag(std::string t, std::string p) : text(t), properties(p) {}
};

/*
 * constructors
 */
node::node(std::string seed) {
 load(seed);
 get_anchors();
 get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

//use regex to find anchors in source
void node::get_anchors() {
 static const regex expression("<[a|A].*?[href|HREF]\\s*=[\"\"'](?<url>.*?)[\"\"'].*?>(?<name>.*?)</[a|A]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^[http:\\/\\/|HTTP:\\/\\/|https:\\/\\/|HTTPS:\\/\\/|ftp:\\/\\/|FTP:\\/\\/|sftp:\\/\\/|SFTP:\\/\\/]");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text, properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_anchor(text, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

//required by libcurl
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 int result = 0;

 if (buffer != NULL) {
  buffer->append(data, size * nmemb);
  result = size * nmemb;
 }
 return result;
}

#endif /* NODE_H_ */

alla ricerca di una soluzione per far sì che la funzione 'int writer' sia "int node::writer".il problema si verifica in std::string node::curlHttpget, quando chiamo CURLOPT_WRITEFUNCTION.

&node::writer compila ma dà un errore di seg =/

Grazie

È stato utile?

Soluzione

Invece di usare std::string* usa node* come parametro o un'altra classe come HttpGet che ha un std::string e un puntatore al tuo nodo in modo che possa scrivere sulla stringa e accedere al tuo nodo ad ogni chiamata.

boost::bind non funzionerà per i callback C-API.

Si compila perché curl_easy_setopt utilizza ...quindi non è assolutamente sicuro per i tipi.Puoi passarlo di qualsiasi tipo desideri sotto il sole e verrà compilato.Probabilmente però non funzionerà, come hai scoperto a tue spese.

Vorrei optare per la sicurezza extra del tipo di fare in modo che la tua funzione abbia esattamente la stessa firma di Curl_write_callback, ad es.void* come quarto parametro ed eseguire il casting nell'implementazione della funzione.

Autorizzato sotto: CC-BY-SA insieme a attribuzione
Non affiliato a StackOverflow
scroll top