Transmission UTF-8 names

Windows and unix-like operating systems differ in how they encode file names and in how long a name may be: NTFS allows 255 UCS-2 characters (510 bytes), whereas unix-like filesystems limit a UTF-8 encoded name to 255 bytes. As a result, long non-ASCII file names that are valid on NTFS may be invalid on unix-like systems. This patch automatically generates shortened file names that are always valid on unix filesystems (tested with ZFS).

Unfortunately, I was not able to explain this issue successfully to the Transmission developers.

This patch makes Transmission generate shortened names for such files on the fly.

--- utils.c-1	2014-05-25 10:24:31.238401512 +0200
+++ utils.c	2014-05-25 10:24:31.139399184 +0200
@@ -368,6 +368,10 @@
   return 0;
 }
 
+#ifdef UTF8NAMES_DEBUG
+FILE *name_log=NULL; /* debug log stream; stays NULL until tr_buildPath opens it on first use */
+#endif
+
 char*
 tr_buildPath (const char *first_element, ...)
 {
@@ -377,6 +381,17 @@
   va_list vl;
   size_t bufLen = 0;
 
+#ifdef UTF8NAMES
+    /* working state for the pass that shortens over-long path components */
+    char *comp_start, *comp_end; // start and end (separator or terminating NUL) of the current path component
+    unsigned char checksum = 0; // running XOR of the bytes cut out of shortened components
+#endif
+#ifdef UTF8NAMES_DEBUG
+    if( name_log == NULL) { // open the debug log only once, on the first call
+        name_log = fopen("/tmp/name_debug.log", "wb"); // NOTE(review): the result is not NULL-checked here
+    }
+#endif
+
   /* pass 1: allocate enough space for the string */
   va_start (vl, first_element);
   element = first_element;
@@ -410,6 +425,58 @@
 
   /* sanity checks & return */
   assert (pch - buf == (off_t)bufLen);
+
+#ifdef UTF8NAMES
+  // walk every path component of buf and shrink, in place, any that is longer than 255 bytes
+  for(comp_start = buf;;) {
+    comp_end = strchrnul(comp_start, TR_PATH_DELIMITER); // points to the separator or to the terminating NUL
+    if (comp_end-comp_start > 255) {
+      // this component is too long: shorten it
+    /* Shortening steps:
+     *  - locate the extension (text after the last '.'); it is kept only if enough name precedes it
+     *  - cut `excess` bytes immediately before the extension (or before the component end)
+     *  - if the cut starts on a UTF-8 continuation byte (0b10xxxxxx), back up to the start of that character
+     */
+      size_t excess=comp_end-comp_start-255+2; /* bytes to remove: overshoot above 255 plus 2 for the checksum digits */
+      char *extPos;
+      char *cutPos;
+#ifdef UTF8NAMES_DEBUG
+      fprintf(name_log, "long name at %d len %ld buflen %ld: %s\n", comp_start-buf, comp_end-comp_start, bufLen, comp_start); /* NOTE(review): %d/%ld are paired with ptrdiff_t/size_t arguments */
+#endif
+      extPos=comp_end;
+      while( extPos>comp_start && *(extPos-1) != '.' ) { /* stop just after the last dot, or at the component start */
+         extPos--;
+      }
+      if( extPos<(comp_start+excess+4) ) { /* fewer than excess+4 bytes precede this point; the +4 is slack for the UTF-8 back-up below */
+        /* no dot found, or too little name before it to absorb the cut: treat the component as having no extension */
+        extPos=comp_end;
+      } else {
+        extPos--; /* step back so that extPos points at the dot itself */
+      }
+
+      cutPos = extPos  -excess;
+      while( cutPos>comp_start && ((*cutPos&0xC0)==0x80) ) { /* back up over UTF-8 continuation bytes so the cut starts on a character boundary */
+        cutPos--;
+      }
+      for( char *counter = cutPos; counter<=extPos; counter++ ) { // XOR bytes cutPos..extPos inclusive, so the byte at extPos ('.' or the terminator) is folded in too
+        checksum ^= *counter; // NOTE(review): checksum is not reset in this loop, so it carries over between shortened components
+      }
+      sprintf( cutPos, "%02X", checksum ); // writes two hex digits at cutPos followed by a NUL at cutPos+2
+      {
+        char *x=cutPos+2, *y=extPos; // destination (over that NUL) lies below the source, so a forward byte copy is safe
+        while (*x++=*y++); // slide the rest of the buffer, through its terminating NUL, down over the gap
+      }
+      comp_end -= extPos-cutPos-2; // the component shrank by this many bytes; keep comp_end on its separator/NUL
+#ifdef UTF8NAMES_DEBUG
+      fprintf( name_log, "Cutted %d at %d ext %d result: %s\n", excess, cutPos-buf, extPos-buf, buf ); /* NOTE(review): %d is paired with size_t/ptrdiff_t arguments */
+#endif
+    };
+
+    if (*comp_end == 0) break; // the component just handled was the last one
+    comp_start = comp_end+1; // the next component starts right after the separator
+  };
+#endif
+
   return buf;
 }