Programmatically Acquiring WebPages

WebPage illustrates how to programmatically acquire a web page. Several current pieces of equipment give access to status and other information via embedded web pages. Several vendors of embedded controllers, including Rabbit Semiconductor and NetBurner, supply HTTP libraries with their systems in order to make web pages viable user interfaces. For instance, my Brother HL2070N Networked Laser Printer and my Motorola 3347 DSL Modem both have web page interfaces.

WebPage.cpp

/************************************************************************************************
*                                                                                               *
*  Copyright (c) 2006 - 2010 David Cass Tyler, PO Box 1026, Willard, NM 87063, (505) 384-5342   *
*                                                                                               *
*  WebPage.cpp calls GetWebPage() for three different URLs and returns the following to stderr: *
*                                                                                               *
*     Succeeded - "http://laptop" is not a Complete Directory Specifier:                        *
*                                                                                               *
*     HTTP/1.1 200 OK                                                                           *
*     X-Powered-By: ASP.NET                                                                     *
*     Content-Location: http://laptop/Default.htm                                               *
*     Content-Type: text/html                                                                   *
*     ETag: "788e773837b6c81:dfe"                                                               *
*     Content-Length: 614                                                                       *
*     Last-Modified: Thu, 15 May 2008 02:56:06 GMT                                              *
*                                                                                               *
*     Succeeded - "http://laptop/SCADA/" is a Directory but Not Fully Qualified:                *
*                                                                                               *
*     HTTP/1.1 200 OK                                                                           *
*     X-Powered-By: ASP.NET                                                                     *
*     Content-Location: http://laptop/SCADA/Default.htm                                         *
*     Content-Type: text/html                                                                   *
*     ETag: "c8935e555776cb1:df6"                                                               *
*     Content-Length: 13084                                                                     *
*     Last-Modified: Thu, 28 Oct 2010 04:19:37 GMT                                              *
*                                                                                               *
*     Succeeded - "http://laptop/SCADA/default.htm" is Fully Qualified:                         *
*                                                                                               *
*     HTTP/1.1 200 OK                                                                           *
*     X-Powered-By: ASP.NET                                                                     *
*     Content-Type: text/html                                                                   *
*     ETag: "c8935e555776cb1:dfe"                                                               *
*     Content-Length: 13084                                                                     *
*     Last-Modified: Thu, 28 Oct 2010 04:19:37 GMT                                              *
*                                                                                               *
*  The text only contents of the web page are written to stdout.                                *
*                                                                                               *
*  From the Command Line, compile as:                                                           *
*                                                                                               *
*     cl /MT /GX WebPage.cpp                                                                    *
*                                                                                               *
*  Tabs are set at 3.                                                                           *
*                                                                                               *
************************************************************************************************/

#include <AFXINET.H>                   // Internet Classes, Prototypes and Defines

// Write a String to stdout with its Tabs, Carriage Returns and Line Feeds Made Visible
//
//    strString - NUL Terminated Text to Dump
//
// A Tab is Printed as "\t", a Carriage Return as "\r" and a Line Feed as "\n" Followed by a Real
// New Line.  Every Other Character is Written Unchanged.

void DumpString ( LPCSTR strString )

   {

   // Walk the Text Up to, but Not Including, its Terminator

   for ( LPCSTR pszChar = strString; * pszChar != '\0'; ++pszChar )
      {
      switch ( * pszChar )
         {
         case '\t':  fputs ( "\\t",       stdout );  break;
         case '\r':  fputs ( "\\r",       stdout );  break;
         case '\n':  fputs ( "\\n\n",     stdout );  break;   // Keep the Real Line Break Too
         default:    fputc ( * pszChar,   stdout );  break;
         }
      }

   }  // void DumpString ( LPCSTR strString )

// Return true if cChar is One of the Characters in szList
//
//    cChar  - Character to Look For
//    szList - NUL Terminated List of Characters to Search; It is Only Read, Never Modified, so
//             String Literals May be Passed Directly
//
// The NUL Terminator is Never Treated as a Member of the List, so InThisList ( '\0', ... ) is
// Always false.  That Lets Callers Use this Test as a Scanning Loop Condition that Stops at the
// End of a String.

bool InThisList ( char cChar, const char szList [] )

   {

   // The Terminator is Not a List Member, Even When the List is Empty

   if ( cChar == '\0' ) return false;

   // Check the Character Against each Character in the List

   for ( ; szList [ 0 ] != '\0'; szList++ )
      {
      if ( szList [ 0 ] == cChar ) return true;
      }

   // Ran Off the End of the List Without Finding It

   return false;

   }  // bool InThisList ( char cChar, const char szList [] )

// Collapse the White Space in a NUL Terminated Buffer, In Place
//
//    pszBuf - Start of the Text to Clean Up; the Pointer Itself is Not Changed
//
// Leading White Space is Dropped, Tabs Become Spaces, Carriage Returns Become New Lines, a Run of
// Spaces and Tabs Shrinks to One Space, a Space Directly in Front of a New Line is Dropped and All
// of the White Space that Follows a New Line is Dropped.  The Result is Never Longer than the
// Original, so it is Written Back Over the Same Buffer.

void GobbleWhiteSpace (  LPTSTR & pszBuf )

   {
   LPTSTR pszSrc = pszBuf;    // Next Character to Read
   LPTSTR pszDst = pszBuf;    // Next Position to Write; Never Ahead of pszSrc

   // Skip Over the Leading Spaces, Carriage Returns, New Lines and Tabs

   while ( InThisList ( * pszSrc, " \r\n\t" ) ) ++pszSrc;

   // Copy the Rest of the Text, One Character per Pass

   while ( * pszSrc != '\0' )
      {

      // Normalize the Character Where it Sits: Tab to Space, Carriage Return to New Line

      if       ( * pszSrc == '\t' ) * pszSrc = ' ';
      else if  ( * pszSrc == '\r' ) * pszSrc = '\n';

      // Copy It and Remember Where it Came From

      LPTSTR pszCopied = pszSrc++;

      * pszDst++ = * pszCopied;

      if ( * pszCopied == ' ' )
         {

         // Swallow the Spaces and Tabs that Follow the Space Just Copied

         while ( InThisList ( * pszSrc, " \t" ) ) ++pszSrc;

         // Take the Space Back Out if a New Line Comes Next.  Only '\n' is Recognized Here; a '\r'
         // at this Position has Not Been Converted Yet, so the Space is Kept in that Case.

         if ( * pszSrc == '\n' ) --pszDst;
         }
      else if ( * pszCopied == '\n' )
         {

         // Swallow ALL of the White Space that Follows the New Line Just Copied

         while ( InThisList ( * pszSrc, " \r\n\t" ) ) ++pszSrc;
         }

      }  // End - Copy the Rest of the Text

   // Terminate the Shortened Text

   * pszDst = '\0';

   }  // void GobbleWhiteSpace (  LPTSTR & pszBuf )

// Strip the HTML Tags - i.e. '<' Through '>' - and Remove Unnecessary White Space
//
//    strBuffer - Web Page Text; Rewritten in Place with the Tags Removed
//
// Each Tag is Replaced with a Single Space so that the Text on Either Side of it Stays Separated;
// GobbleWhiteSpace() then Collapses the White Space that this Leaves Behind.  A '<' that has No
// Closing '>' Swallows the Rest of the Text.

void StripTags ( CString & strBuffer )

   {

   // Nothing to Strip

   if ( strBuffer.IsEmpty() ) return;

   // GetBuffer() / ReleaseBuffer() are Used, Rather than LockBuffer() / UnlockBuffer(), Because the
   // Text Gets Shorter and ReleaseBuffer() is What Makes the CString Measure its New Length

   LPTSTR pszBuf = strBuffer.GetBuffer ( strBuffer.GetLength() );

   LPTSTR pszIpt = pszBuf;
   LPTSTR pszOpt = pszBuf;

   // Move the Starting Point Across Leading White Space - i.e. Spaces, Carriage Returns, New Lines and Tabs

   while ( InThisList ( pszIpt [ 0 ], " \r\n\t" ) ) pszIpt++;

   // Process All Input Bytes

   while ( pszIpt [ 0 ] != '\0' )
      {

      // Gobble Up Tags - i.e. Everything Between '<' and '>'

      while ( pszIpt [ 0 ] == '<' )
         {
         while ( pszIpt [ 0 ] && pszIpt [ 0 ] != '>'  ) pszIpt++;
         if    ( pszIpt [ 0 ] == '>'                  ) pszIpt++;

         // Add a Space Where the Tag Was - a Tag is at Least One Character Long, so the Output
         // Pointer Can Never Pass the Input Pointer

         pszOpt [ 0 ] = ' ';
         pszOpt++;
         }

      // Done if the Tag(s) Ran Up to the End of the Text - Copying the Terminator and then Stepping
      // Over It Would Send the Scan Past the End of the Buffer

      if ( pszIpt [ 0 ] == '\0' ) break;

      // Copy the Input to the Output and Replace Tabs with Spaces and Carriage Returns with New Lines

      if       ( pszIpt [ 0 ] == '\t' )   pszOpt [ 0 ] = ' ';
      else if  ( pszIpt [ 0 ] == '\r' )   pszOpt [ 0 ] = '\n';
      else                                pszOpt [ 0 ] = pszIpt [ 0 ];

      // Increment the Input and Output Pointers

      pszIpt++;  pszOpt++;

      }  // End - Process All Input Bytes

   // Terminate the Output String

   pszOpt [ 0 ] = '\0';

   // Clean Up the White Space

   GobbleWhiteSpace ( pszBuf );

   // Hand the Buffer Back, Letting the CString Pick Up its New Length from the Terminator, and
   // then Trim the String Buffer

   strBuffer.ReleaseBuffer();
   strBuffer.FreeExtra();

   }  // void StripTags ( CString & strBuffer )

// Replace "\r\r\n" (CR/CR/LF) and "\r\n" (CR/LF) with '\n' (LF)
//
//    strWebPage - Text to Convert; Rewritten in Place
//
// A Carriage Return that is Not Part of One of those Two Sequences is Copied Through Unchanged.

void FixHtmlLineEnds ( CString & strWebPage )

   {

   // Nothing to Convert

   if ( strWebPage.IsEmpty() ) return;

   // GetBuffer() / ReleaseBuffer() are Used, Rather than LockBuffer() / UnlockBuffer(), Because the
   // Text Gets Shorter and ReleaseBuffer() is What Makes the CString Measure its New Length

   LPTSTR pszBuf = strWebPage.GetBuffer ( strWebPage.GetLength() );

   LPTSTR pszIpt = pszBuf;
   LPTSTR pszOpt = pszBuf;

   // Process All Input Bytes

   while ( pszIpt [ 0 ] != '\0' )
      {

      // Change CR/CR/LF to LF - the Tests Run Left to Right, so a Byte is Only Looked at When the
      // One Before it was Not the Terminator

      if ( pszIpt [ 0 ] == '\r' && pszIpt [ 1 ] == '\r' && pszIpt [ 2 ] == '\n' )
         {
         pszOpt [ 0 ] = '\n';  pszIpt += 2;
         }

      // Process CR/LF to LF

      else if ( pszIpt [ 0 ] == '\r' && pszIpt [ 1 ] == '\n' )
         {
         pszOpt [ 0 ] = '\n';  pszIpt += 1;
         }

      // Process Everything Else

      else
         {
         pszOpt [ 0 ] = pszIpt [ 0 ];
         }

      // Increment the Input and Output Pointers

      pszIpt++;  pszOpt++;
      }

   // Terminate the Output Line

   pszOpt [ 0 ] = '\0';

   // Hand the Buffer Back, Letting the CString Pick Up its New Length from the Terminator, and
   // then Trim the String Buffer

   strWebPage.ReleaseBuffer();
   strWebPage.FreeExtra();

   }  // void FixHtmlLineEnds ( CString & strWebPage )

// Get the Specified Web Page
//
//    pszWebPageURL - URL of the Page to Fetch; Only http:// and https:// URLs are Accepted
//    strWebPage    - Receives the Body of the Response with its Line Ends Converted to '\n'
//    strRawHdrs    - Receives the Raw Response Headers with their Line Ends Converted to '\n'
//
// Returns true Only if the Server Answered with HTTP_STATUS_OK.  Both Output Strings are Emptied
// First, so they are Empty After an Unparsable or Unsupported URL.  The Request is Opened with
// INTERNET_FLAG_NO_AUTO_REDIRECT, so any Status Other than HTTP_STATUS_OK, a Redirect Included,
// Comes Back as false with the Headers in strRawHdrs.  A CInternetException is Caught Here, Reported
// on stdout and Turned into a false Return.

bool GetWebPage ( LPCSTR pszWebPageURL, CString & strWebPage, CString & strRawHdrs )

   {
   CString        strServerName;
   CString        strObject;

   INTERNET_PORT  wPort;
   DWORD          dwServiceType;

   // Initialize for Failure Mode

   strRawHdrs.Empty();
   strWebPage.Empty();

   // Done if Not Parsable

   if ( ! AfxParseURL ( pszWebPageURL, dwServiceType, strServerName, strObject, wPort ) )
      return false;

   if ( dwServiceType != INTERNET_SERVICE_HTTP && dwServiceType != AFX_INET_SERVICE_HTTPS )
      return false;

   // Connect to the Server and Retrieve the Web Page

   static const char    szHttpHeaders[]      = "Accept:  text/*\r\n" "User-Agent:  DctLib\r\n";

   DWORD                dwHttpRequestFlags   = INTERNET_FLAG_EXISTING_CONNECT | INTERNET_FLAG_NO_AUTO_REDIRECT;

   // An https:// URL Has to Ask for a Secure Transaction, Otherwise the Request is Sent in the Clear

   if ( dwServiceType == AFX_INET_SERVICE_HTTPS ) dwHttpRequestFlags |= INTERNET_FLAG_SECURE;

   CInternetSession     inetSession          ( "DctLib", PRE_CONFIG_INTERNET_ACCESS );

   CHttpConnection      * pHost              = NULL;
   CHttpFile            * pPage              = NULL;

   char                 szLine [ 1024 ]      = { 0 };
   DWORD                dwRet                = 0LU;

   // Request that GetWebPage() Catch Thrown CInternetException Exceptions

   try
      {
      pHost = inetSession.GetHttpConnection ( strServerName, wPort );
      pPage = pHost->OpenRequest ( CHttpConnection::HTTP_VERB_GET, strObject, NULL, 1, NULL, NULL, dwHttpRequestFlags );

      pPage->AddRequestHeaders   ( szHttpHeaders );
      pPage->SendRequest         ();

      // Get the Raw Headers for their Error Code Information

      pPage->QueryInfo           ( HTTP_QUERY_RAW_HEADERS_CRLF, strRawHdrs );
      pPage->QueryInfoStatusCode ( dwRet );

      // Append a Line at a Time to strWebPage

      while ( pPage->ReadString ( szLine, sizeof szLine - 1 ) ) strWebPage += szLine;
      }
   catch ( CInternetException * pEx )
      {
      char szErrorMessage [ 1024 ] = { 0 };
      pEx->GetErrorMessage ( szErrorMessage, sizeof szErrorMessage - 1 );
      fprintf ( stdout, "Error 0x%X:  \"%s\"\n\a", pEx->m_dwError, szErrorMessage );
      pEx->Delete();

      // Whatever Status was Read Before the Exception No Longer Counts as Success

      dwRet = 0LU;
      }

   // Release the Request and the Connection, Whichever of them Got Created

   if ( pPage != NULL ) { pPage->Close(); delete pPage; }
   if ( pHost != NULL ) { pHost->Close(); delete pHost; }

   inetSession.Close();

   // Normalize the Line Ends of Whatever was Received, Even on Failure

   FixHtmlLineEnds ( strRawHdrs );
   FixHtmlLineEnds ( strWebPage );

   return dwRet == HTTP_STATUS_OK;

   }  // bool GetWebPage ( LPCSTR pszWebPageURL, CString & strWebPage, CString & strRawHdrs )

// Test the Code

int main ( int /* argc *., char * /* argv */ [], char * /* envp */ [] )

   {
   CString  strRawHdrs;
   CString  strWebPage;

   bool     fResult;

   // The Following URL Does Not Indicate a Directory

   fResult = GetWebPage ( "http://laptop", strWebPage, strRawHdrs );
   fprintf ( stderr, "\n%s - \"http://laptop\" is not a Complete Directory Specifier:\n\n%s", fResult ? "Succeeded" : "Failed", strRawHdrs );

   // The Following URL Does Indicate a Directory but is Not Fully Qualified

   fResult = GetWebPage ( "http://laptop/SCADA/", strWebPage, strRawHdrs );
   fprintf ( stderr, "%s - \"http://laptop/SCADA/\" is a Directory but Not Fully Qualified:\n\n%s", fResult ? "Succeeded" : "Failed", strRawHdrs );

   // The Following URL Is Fully Qualified!

   fResult = GetWebPage ( "http://laptop/SCADA/default.htm", strWebPage, strRawHdrs );
   fprintf ( stderr, "%s - \"http://laptop/SCADA/default.htm\" is Fully Qualified:\n\n%s", fResult ? "Succeeded" : "Failed", strRawHdrs );

   // Dump the Web Page to Show the Embedded White Space

// DumpString ( strWebPage );

   // Remove the HTML Tags and Return the Just the Text

   StripTags ( strWebPage );

   // Print the Text from the Web Page

   fprintf ( stdout, "Web Page:\n\n%s", strWebPage );

   // Return Success!

   return 0;

   }  // int main ( int argc, char * argv [], char * envp [] )