Go back to Richel Bilderbeek's homepage.

Go back to Richel Bilderbeek's C++ page.

 

 

 

 

 

(C++) HtmlPage

 

HtmlPage is a class that reads an HTML file and extracts the page's title.

 

 

 

 

 

htmlpage.h

 

//---------------------------------------------------------------------------
/*
HtmlPage, HTML page class
Copyright (C) 2011 Richel Bilderbeek

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
//---------------------------------------------------------------------------
//From http://www.richelbilderbeek.nl/CppHtmlPage.htm
//---------------------------------------------------------------------------
#ifndef HTMLPAGE_H
#define HTMLPAGE_H
//---------------------------------------------------------------------------
#include <string>
#include <vector>
//---------------------------------------------------------------------------
///HtmlPage holds the filename of an HTML file and the title found in it.
///It also bundles the static helper functions used to obtain that title.
struct HtmlPage
{
  ///Construct from the name of an HTML file; the file's title is looked up
  HtmlPage(const std::string& filename);

  ///The filename this HtmlPage was constructed with
  const std::string& GetFilename() const
  {
    return m_filename;
  }

  ///The title stored for this page
  const std::string& GetTitle() const
  {
    return m_title;
  }

  ///Check whether a file with this name exists
  static bool FileExists(const std::string& filename);

  ///Read a file into a std::vector of its lines
  static const std::vector<std::string> FileToVector(
    const std::string& filename);

  ///Obtain a copy of s with replaceWhat replaced by replaceWithWhat
  static const std::string ReplaceAll(
    std::string s,
    const std::string& replaceWhat,
    const std::string& replaceWithWhat);

  private:
  ///Name of the HTML file
  std::string m_filename;
  ///Title of the HTML page
  std::string m_title;
};
//---------------------------------------------------------------------------
///Orders two HtmlPages by their titles, compared case-insensitively
///(both titles are converted to lower case before comparison)
bool operator<(const HtmlPage& lhs, const HtmlPage& rhs);
//---------------------------------------------------------------------------
#endif // HTMLPAGE_H
//---------------------------------------------------------------------------

 

 

 

 

 

htmlpage.cpp

 

//---------------------------------------------------------------------------
/*
HtmlPage, HTML page class
Copyright (C) 2011 Richel Bilderbeek

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
//---------------------------------------------------------------------------
//From http://www.richelbilderbeek.nl/CppHtmlPage.htm
//---------------------------------------------------------------------------
#include <cassert>
#include <cctype>
#include <fstream>
#include <iostream>
//---------------------------------------------------------------------------
#include <boost/foreach.hpp>
#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>
//---------------------------------------------------------------------------
#include "htmlpage.h"
//---------------------------------------------------------------------------
///Constructs an HtmlPage from the HTML file called filename and extracts
///the page's title from it.
///
///Every line of the file is checked for a title element that opens and
///closes on that same line. The text between the first "<title>" and the
///last "</title>" on such a line becomes the title, with every "&amp;"
///converted to "&". If multiple lines contain a title element, the last
///one wins. If no line contains one, the title remains empty.
///
///filename: name of an existing file (asserted in debug builds)
HtmlPage::HtmlPage(const std::string& filename)
  : m_filename(filename)
{
  assert(FileExists(filename));

  const boost::regex title_regex("<title>.*</title>");
  const std::string open_tag("<title>");
  const std::string close_tag("</title>");

  //Inspect every line of the file for a title element
  BOOST_FOREACH(const std::string& s, FileToVector(filename))
  {
    if (!boost::regex_search(s,title_regex)) continue;

    //Locate the tags by position instead of trimming whitespace and
    //asserting that the line consists of nothing but the title element:
    //this also accepts markup around the element and never passes a
    //(possibly negative) char to std::isgraph
    const std::string::size_type open_pos = s.find(open_tag);
    const std::string::size_type close_pos = s.rfind(close_tag);
    if (open_pos == std::string::npos || close_pos == std::string::npos) continue;

    const std::string::size_type text_begin = open_pos + open_tag.size();
    //Defensive: the closing tag must not start before the opening tag ends
    if (close_pos < text_begin) continue;

    //Extract title and decode escaped ampersands
    m_title = ReplaceAll(
      s.substr(text_begin,close_pos - text_begin),"&amp;","&");
  }
}
//---------------------------------------------------------------------------
///FileExists determines whether a file called filename exists, by
///trying to open it for reading: returns true if opening succeeded
///From http://www.richelbilderbeek.nl/CppFileExists.htm
bool HtmlPage::FileExists(const std::string& filename)
{
  //Open upon construction; the stream closes itself when leaving scope
  std::fstream stream(filename.c_str(),std::ios::in);
  return stream.is_open();
}
//---------------------------------------------------------------------------
///FileToVector reads a file and converts it to a std::vector<std::string>,
///one element per line (without the newline characters).
///
///If the file ends with a newline (or is empty), the returned std::vector
///ends with one additional empty std::string, as the final read attempt
///that hits the end of the file is stored as well.
///
///filename: name of an existing file (asserted in debug builds). If the
///file cannot be opened in a build without assertions, an empty
///std::vector is returned.
///From http://www.richelbilderbeek.nl/CppFileToVector.htm
const std::vector<std::string> HtmlPage::FileToVector(const std::string& filename)
{
  assert(FileExists(filename));
  std::vector<std::string> v;
  std::ifstream in(filename.c_str());
  std::string s;
  //Test good() instead of only !eof(): a stream that failed to open or
  //that ran into a read error never reaches eof, which would make the
  //loop run forever
  while (in.good())
  {
    std::getline(in,s);
    v.push_back(s);
  }
  return v;
}
//---------------------------------------------------------------------------
///ReplaceAll returns a copy of s in which every occurrence of replaceWhat
///is replaced by replaceWithWhat.
///
///The std::string is processed once, from left to right: text that was
///put in as a replacement is not searched again. Therefore the function
///also terminates when replaceWithWhat contains replaceWhat, and
///"&amp;amp;" is decoded to "&amp;" instead of "&".
///
///s: the text to work on (taken by value, the caller's copy is untouched)
///replaceWhat: the text to search for; if empty, s is returned unchanged
///replaceWithWhat: the text to put in place of each occurrence
//From http://www.richelbilderbeek.nl/CppReplaceAll.htm
const std::string HtmlPage::ReplaceAll(
  std::string s,
  const std::string& replaceWhat,
  const std::string& replaceWithWhat)
{
  //An empty pattern is found at every position: nothing sensible to replace
  if (replaceWhat.empty()) return s;

  //Use the std::string's own size_type, so npos is detected reliably
  std::string::size_type pos = 0;
  while ((pos = s.find(replaceWhat,pos)) != std::string::npos)
  {
    s.replace(pos,replaceWhat.size(),replaceWithWhat);
    //Continue searching after the text just put in
    pos += replaceWithWhat.size();
  }
  return s;
}
//---------------------------------------------------------------------------
///Orders two HtmlPages by title, ignoring the case of the titles
bool operator<(const HtmlPage& lhs, const HtmlPage& rhs)
{
  //Compare lower case copies of both titles
  const std::string lhs_title_lower
    = boost::algorithm::to_lower_copy(lhs.GetTitle());
  const std::string rhs_title_lower
    = boost::algorithm::to_lower_copy(rhs.GetTitle());
  return lhs_title_lower < rhs_title_lower;
}
//---------------------------------------------------------------------------

 

 

 

 

 

Go back to Richel Bilderbeek's C++ page.

Go back to Richel Bilderbeek's homepage.

 

Valid XHTML 1.0 Strict

This page has been created by the tool CodeToHtml