Use a more robust approach to separate the scheme, hostname, and filename in URLParse()

2021-04-20 02:39:41 +03:00 · 2021-04-20 02:39:41 +03:00 · eb9111646e
parent 3680d14485
commit eb9111646e
1 changed files with 34 additions and 64 deletions
--- a/common/url.c
+++ b/common/url.c
@ -404,8 +404,7 @@ MemPool mp;
 char *url;
 {
  URLParts *up;
-  char *start;
+  char *cursor;
  char *colon, *slash, *fslash;
  char *pound; /* link pound (#) sign */
  char *at; /* username/password @ */
  char *ucolon; /* username colon */
@ -414,83 +413,49 @@ char *url;
  up = URLCreate(mp);
  /* skip leading white-space (if any)*/
-  for (start = url; isspace8(*start); start++)
+  for (cursor = url; isspace8(*cursor); cursor++)
      ;
-  /*
+  // Extract the scheme, if any
-   * Look for indication of a scheme.
+  up->scheme = URLGetScheme(mp, cursor);
   */
  colon = strchr(start, ':');
-  /*
+  if (up->scheme)
   * Search for characters that indicate the beginning of the
   * path/params/query/fragment part.
   */
  slash = strchr(start, '/');
  if (slash == NULL) slash = strchr(start, ';');
  if (slash == NULL) slash = strchr(start, '?');
  if (slash == NULL) slash = strchr(start, '#');
  /*
   * Check to see if there is a scheme.  There is a scheme only if
   * all other separators appear after the colon.
   */
  if (colon != NULL && (slash == NULL || colon < slash))
  {
-    up->scheme = MPGet(mp, colon - start + 1);
+    // Skip the scheme and the : that follows
-    strncpy(up->scheme, start, colon - start);
+    // up->scheme contains the part before the :
-    up->scheme[colon - start] = '\0';
+    // Therefore, its cursor + length == the position of the :
    // We know that we have the : there, so skip that position
    cursor += strlen(up->scheme) + 1;
  }
-  /*
+  // If we have scheme://, we have a hostname and filename
-   * If there is a slash then sort out the hostname and filename.
+  // Otherwise, only filename
-   * If there is no slash then there is no hostname but there is a
+  if (up->scheme && cursor[0] == '/' && cursor[1] == '/')
   * filename.
   */
  if (slash != NULL)
  {
-    /*
+    // Move the cursor after the //
-     * Check for leading //. If its there then there is a host string.
+    cursor += 2;
-     */
+
-    if ((*(slash + 1) == '/') && ((colon == NULL && slash == start) ||
+    // We know we have at least the hostname
-	(colon != NULL && slash == colon + 1)))
+    // Do we also have a slash, marking the existence of filename?
    const char *slash = strchr(cursor, '/');
    if (slash != NULL)
    {
-      /*
+      // Yes, until the slash is hostname, after it the filename
-       * Check for filename at end of host string.
+      up->hostname = MPGet(mp, slash - cursor + 1);
-       */
+      strncpy(up->hostname, cursor, slash - cursor);
-      slash += 2;
+      up->hostname[slash - cursor] = '\0';
-      if ((fslash = strchr(slash, '/')) != NULL)
+      up->filename = MPStrDup(mp, slash);
      {
 	up->hostname = MPGet(mp, fslash - slash + 1);
 	strncpy(up->hostname, slash, fslash - slash);
 	up->hostname[fslash - slash] = '\0';
 	up->filename = MPStrDup(mp, fslash);
      }
      else
      { /* there is no filename */
 	up->hostname = MPStrDup(mp, slash);
      }
    }
    else
    {
-      /*
+      // No, the whole thing is a hostname
-       * the rest is a filename because there is no // or it appears
+      up->hostname = MPStrDup(mp, cursor);
       * after other characters
       */
      if (colon != NULL && colon < slash)
      {
 	up->filename = MPStrDup(mp, colon + 1);
      }
      else up->filename = MPStrDup(mp, start);
    }
  }
  else
  {
-    /*
+    // No //, so this is all filename
-     * No slashes at all so the rest must be a filename.
+    up->filename = MPStrDup(mp, cursor);
     */
    if (colon == NULL) up->filename = MPStrDup(mp, start);
    else up->filename = MPStrDup(mp, colon + 1);
  }
  /*
@ -597,14 +562,19 @@ char *url;
  {
    for (dp = URLDELIMS; *dp != '\0'; dp++)
    {
      // Did we come across a URL delimiter?
      if (*cp == *dp)
      {
        // Yes, was it ':'?
 	if (*cp == ':')
 	{
 	  // Yes it was, scheme is what was before it
 	  r = (char *)MPCGet(mp, cp - url + 1);
 	  strncpy(r, url, cp - url);
 	  r[cp - url] = '\0';
 	  return(r);
 	}
 	// No, it was something else. No scheme found.
 	return(NULL);
      }
    }