Use a more robust approach to separate the scheme, hostname, and filename in URLParse()

2021-04-20 02:39:41 +03:00 · 2021-04-20 02:39:41 +03:00 · eb9111646e
parent 3680d14485
commit eb9111646e
1 changed files with 34 additions and 64 deletions
--- a/common/url.c
+++ b/common/url.c
@ -404,8 +404,7 @@ MemPool mp;
 char *url;
 {
  URLParts *up;
-  char *start;
-  char *colon, *slash, *fslash;
+  char *cursor;
  char *pound; /* link pound (#) sign */
  char *at; /* username/password @ */
  char *ucolon; /* username colon */
@ -414,83 +413,49 @@ char *url;
  up = URLCreate(mp);

  /* skip leading white-space (if any)*/
-  for (start = url; isspace8(*start); start++)
+  for (cursor = url; isspace8(*cursor); cursor++)
      ;

-  /*
-   * Look for indication of a scheme.
-   */
-  colon = strchr(start, ':');
+  // Extract the scheme, if any
+  up->scheme = URLGetScheme(mp, cursor);

-  /*
-   * Search for characters that indicate the beginning of the
-   * path/params/query/fragment part.
-   */
-  slash = strchr(start, '/');
-  if (slash == NULL) slash = strchr(start, ';');
-  if (slash == NULL) slash = strchr(start, '?');
-  if (slash == NULL) slash = strchr(start, '#');
-
-  /*
-   * Check to see if there is a scheme.  There is a scheme only if
-   * all other separators appear after the colon.
-   */
-  if (colon != NULL && (slash == NULL || colon < slash))
+  if (up->scheme)
  {
-    up->scheme = MPGet(mp, colon - start + 1);
-    strncpy(up->scheme, start, colon - start);
-    up->scheme[colon - start] = '\0';
+    // Skip the scheme and the ':' that follows it.
+    // up->scheme holds the text before the ':', so
+    // cursor + strlen(up->scheme) is the position of the ':'.
+    // The ':' is known to be there, so step past it as well.
+    cursor += strlen(up->scheme) + 1;
  }

-  /*
-   * If there is a slash then sort out the hostname and filename.
-   * If there is no slash then there is no hostname but there is a
-   * filename.
-   */
-  if (slash != NULL)
+  // If we have scheme://, we have a hostname and possibly a filename
+  // Otherwise, only a filename
+  if (up->scheme && cursor[0] == '/' && cursor[1] == '/')
  {
-    /*
-     * Check for leading //. If its there then there is a host string.
-     */
-    if ((*(slash + 1) == '/') && ((colon == NULL && slash == start) ||
-	(colon != NULL && slash == colon + 1)))
+    // Move the cursor after the //
+    cursor += 2;
+
+    // We know we have at least the hostname
+    // Do we also have a slash, marking the existence of filename?
+    const char *slash = strchr(cursor, '/');
+    if (slash != NULL)
    {
-      /*
-       * Check for filename at end of host string.
-       */
-      slash += 2;
-      if ((fslash = strchr(slash, '/')) != NULL)
-      {
-	up->hostname = MPGet(mp, fslash - slash + 1);
-	strncpy(up->hostname, slash, fslash - slash);
-	up->hostname[fslash - slash] = '\0';
-	up->filename = MPStrDup(mp, fslash);
-      }
-      else
-      { /* there is no filename */
-	up->hostname = MPStrDup(mp, slash);
-      }
+      // Yes: the text before the slash is the hostname; the filename starts at the slash
+      up->hostname = MPGet(mp, slash - cursor + 1);
+      strncpy(up->hostname, cursor, slash - cursor);
+      up->hostname[slash - cursor] = '\0';
+      up->filename = MPStrDup(mp, slash);
    }
    else
    {
-      /*
-       * the rest is a filename because there is no // or it appears
-       * after other characters
-       */
-      if (colon != NULL && colon < slash)
-      {
-	up->filename = MPStrDup(mp, colon + 1);
-      }
-      else up->filename = MPStrDup(mp, start);
+      // No, the whole thing is a hostname
+      up->hostname = MPStrDup(mp, cursor);
    }
  }
  else
  {
-    /*
-     * No slashes at all so the rest must be a filename.
-     */
-    if (colon == NULL) up->filename = MPStrDup(mp, start);
-    else up->filename = MPStrDup(mp, colon + 1);
+    // No //, so this is all filename
+    up->filename = MPStrDup(mp, cursor);
  }

  /*
@ -597,14 +562,19 @@ char *url;
  {
    for (dp = URLDELIMS; *dp != '\0'; dp++)
    {
+      // Did we come across a URL delimiter?
      if (*cp == *dp)
      {
+        // Yes, was it ':'?
 	if (*cp == ':')
 	{
+	  // Yes it was, scheme is what was before it
 	  r = (char *)MPCGet(mp, cp - url + 1);
 	  strncpy(r, url, cp - url);
+	  r[cp - url] = '\0';
 	  return(r);
 	}
+	// No, it was something else. No scheme found.
 	return(NULL);
      }
    }