Ruby 1.9.3p537 (2014-02-19 revision 0)
string.c
Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   string.c -
00004 
00005   $Author$
00006   created at: Mon Aug  9 17:12:58 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009   Copyright (C) 2000  Network Applied Communication Laboratory, Inc.
00010   Copyright (C) 2000  Information-technology Promotion Agency, Japan
00011 
00012 **********************************************************************/
00013 
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019 
00020 #define BEG(no) (regs->beg[(no)])
00021 #define END(no) (regs->end[(no)])
00022 
00023 #include <math.h>
00024 #include <ctype.h>
00025 
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029 
00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031 
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048 
00049 static VALUE rb_str_clear(VALUE str);
00050 
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053 
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits. */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2 /* = ELTS_SHARED */
#define STR_ASSOC   FL_USER3

/* Every flag examined by the predicates below. */
#define STR_NOCAPA  (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)

/* STR_NOEMBED and ELTS_SHARED are both set. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* STR_NOEMBED and STR_ASSOC are both set. */
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* STR_NOEMBED is set together with ELTS_SHARED and/or STR_ASSOC. */
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* On a STR_NOEMBED string clear ELTS_SHARED and STR_ASSOC; otherwise a no-op. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)
00066 
00067 
/* A string is "embedded" exactly when STR_NOEMBED is clear. */
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)

/* Store n in the embedded-length bit field of the object's flags. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)

/* Set STR_NOEMBED and zero the embedded-length bit field. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
00079 
/* Record length n in whichever place the current representation uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)

/* Shorten the recorded length by one. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        long dec_len = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), dec_len);\
    }\
} while (0)
00099 
/*
 * Make room for `capacity` bytes plus a terminator.
 *  - heap string: reallocate the buffer, and record the capacity unless
 *    the string is in a NOCAPA state.
 *  - embedded string: nothing to do while `capacity` fits the embedded
 *    buffer; otherwise copy the current bytes into a fresh heap buffer
 *    and switch the string to the heap representation.
 * The embedded length is read from the flags, so it is still readable
 * after as.heap.ptr has been written and before STR_SET_NOEMBED runs.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (!STR_EMBED_P(str)) {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
    else if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
        char *tmp = ALLOC_N(char, (capacity)+1);\
        memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
        RSTRING(str)->as.heap.ptr = tmp;\
        RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
        STR_SET_NOEMBED(str);\
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00117 
/* Coderange predicates; both may scan the string via rb_enc_str_coderange(). */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* The rb_encoding* for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122 
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126     rb_encoding *enc;
00127 
00128     /* Conservative.  It may be ENC_CODERANGE_UNKNOWN. */
00129     if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130         return 1;
00131 
00132     enc = STR_ENC_GET(str);
00133     if (rb_enc_mbmaxlen(enc) == 1)
00134         return 1;
00135 
00136     /* Conservative.  Possibly single byte.
00137      * "\xa1" in Shift_JIS for example. */
00138     return 0;
00139 }
00140 
00141 VALUE rb_fs;
00142 
/*
 * Returns a pointer to the first byte in [p, e) that is not ASCII, or
 * NULL when there is none.
 *
 * When NONASCII_MASK is available and the range is longer than two
 * VALUE-sized words, the aligned middle of the range is tested one word
 * at a time; the unaligned head and the remainder are tested per byte.
 * NONASCII_MASK is defined here and reused later in this file.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE lowbits = sizeof(VALUE) - 1;
        /* p rounded up, and e rounded down, to a word boundary */
        const VALUE *word = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
        const VALUE *word_end = (const VALUE*)(~lowbits & (VALUE)e);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* stop at the first word with any high bit set */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        /* the byte loop below pinpoints the byte (or finishes the tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179 
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183     const char *e = p + len;
00184 
00185     if (rb_enc_to_index(enc) == 0) {
00186         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00187         p = search_nonascii(p, e);
00188         return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189     }
00190 
00191     if (rb_enc_asciicompat(enc)) {
00192         p = search_nonascii(p, e);
00193         if (!p) {
00194             return ENC_CODERANGE_7BIT;
00195         }
00196         while (p < e) {
00197             int ret = rb_enc_precise_mbclen(p, e, enc);
00198             if (!MBCLEN_CHARFOUND_P(ret)) {
00199                 return ENC_CODERANGE_BROKEN;
00200             }
00201             p += MBCLEN_CHARFOUND_LEN(ret);
00202             if (p < e) {
00203                 p = search_nonascii(p, e);
00204                 if (!p) {
00205                     return ENC_CODERANGE_VALID;
00206                 }
00207             }
00208         }
00209         if (e < p) {
00210             return ENC_CODERANGE_BROKEN;
00211         }
00212         return ENC_CODERANGE_VALID;
00213     }
00214 
00215     while (p < e) {
00216         int ret = rb_enc_precise_mbclen(p, e, enc);
00217 
00218         if (!MBCLEN_CHARFOUND_P(ret)) {
00219             return ENC_CODERANGE_BROKEN;
00220         }
00221         p += MBCLEN_CHARFOUND_LEN(ret);
00222     }
00223     if (e < p) {
00224         return ENC_CODERANGE_BROKEN;
00225     }
00226     return ENC_CODERANGE_VALID;
00227 }
00228 
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232     const char *p = s;
00233 
00234     if (*cr == ENC_CODERANGE_BROKEN)
00235         return e - s;
00236 
00237     if (rb_enc_to_index(enc) == 0) {
00238         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
00239         p = search_nonascii(p, e);
00240         *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241         return e - s;
00242     }
00243     else if (rb_enc_asciicompat(enc)) {
00244         p = search_nonascii(p, e);
00245         if (!p) {
00246             if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247             return e - s;
00248         }
00249         while (p < e) {
00250             int ret = rb_enc_precise_mbclen(p, e, enc);
00251             if (!MBCLEN_CHARFOUND_P(ret)) {
00252                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253                 return p - s;
00254             }
00255             p += MBCLEN_CHARFOUND_LEN(ret);
00256             if (p < e) {
00257                 p = search_nonascii(p, e);
00258                 if (!p) {
00259                     *cr = ENC_CODERANGE_VALID;
00260                     return e - s;
00261                 }
00262             }
00263         }
00264         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265         return p - s;
00266     }
00267     else {
00268         while (p < e) {
00269             int ret = rb_enc_precise_mbclen(p, e, enc);
00270             if (!MBCLEN_CHARFOUND_P(ret)) {
00271                 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272                 return p - s;
00273             }
00274             p += MBCLEN_CHARFOUND_LEN(ret);
00275         }
00276         *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277         return p - s;
00278     }
00279 }
00280 
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284     rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286 
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290     /* this function is designed for copying encoding and coderange
00291      * from src to new string "dest" which is made from the part of src.
00292      */
00293     str_enc_copy(dest, src);
00294     switch (ENC_CODERANGE(src)) {
00295       case ENC_CODERANGE_7BIT:
00296         ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297         break;
00298       case ENC_CODERANGE_VALID:
00299         if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300             search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301             ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302         else
00303             ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304         break;
00305       default:
00306         if (RSTRING_LEN(dest) == 0) {
00307             if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309             else
00310                 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311         }
00312         break;
00313     }
00314 }
00315 
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319     str_enc_copy(dest, src);
00320     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322 
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326     int cr = ENC_CODERANGE(str);
00327 
00328     if (cr == ENC_CODERANGE_UNKNOWN) {
00329         rb_encoding *enc = STR_ENC_GET(str);
00330         cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331         ENC_CODERANGE_SET(str, cr);
00332     }
00333     return cr;
00334 }
00335 
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339     rb_encoding *enc = STR_ENC_GET(str);
00340 
00341     if (!rb_enc_asciicompat(enc))
00342         return FALSE;
00343     else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344         return TRUE;
00345     return FALSE;
00346 }
00347 
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351     if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352         rb_raise(rb_eRuntimeError, "string modified");
00353     }
00354 }
00355 
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359     if (STR_EMBED_P(str)) {
00360         return RSTRING_EMBED_LEN_MAX;
00361     }
00362     else if (STR_NOCAPA_P(str)) {
00363         return RSTRING(str)->as.heap.len;
00364     }
00365     else {
00366         return RSTRING(str)->as.heap.aux.capa;
00367     }
00368 }
00369 
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373     NEWOBJ(str, struct RString);
00374     OBJSETUP(str, klass, T_STRING);
00375 
00376     str->as.heap.ptr = 0;
00377     str->as.heap.len = 0;
00378     str->as.heap.aux.capa = 0;
00379 
00380     return (VALUE)str;
00381 }
00382 
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386     VALUE str;
00387 
00388     if (len < 0) {
00389         rb_raise(rb_eArgError, "negative string size (or size too big)");
00390     }
00391 
00392     str = str_alloc(klass);
00393     if (len > RSTRING_EMBED_LEN_MAX) {
00394         RSTRING(str)->as.heap.aux.capa = len;
00395         RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396         STR_SET_NOEMBED(str);
00397     }
00398     else if (len == 0) {
00399         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400     }
00401     if (ptr) {
00402         memcpy(RSTRING_PTR(str), ptr, len);
00403     }
00404     STR_SET_LEN(str, len);
00405     RSTRING_PTR(str)[len] = '\0';
00406     return str;
00407 }
00408 
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412     return str_new(rb_cString, ptr, len);
00413 }
00414 
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418     VALUE str = rb_str_new(ptr, len);
00419     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420     return str;
00421 }
00422 
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426     VALUE str = rb_str_new(ptr, len);
00427     rb_enc_associate(str, enc);
00428     return str;
00429 }
00430 
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434     if (!ptr) {
00435         rb_raise(rb_eArgError, "NULL pointer given");
00436     }
00437     return rb_str_new(ptr, strlen(ptr));
00438 }
00439 
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442 
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446     VALUE str = rb_str_new2(ptr);
00447     ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448     return str;
00449 }
00450 
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453 
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457     VALUE str = rb_str_new(ptr, len);
00458 
00459     OBJ_TAINT(str);
00460     return str;
00461 }
00462 
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466     VALUE str = rb_str_new2(ptr);
00467 
00468     OBJ_TAINT(str);
00469     return str;
00470 }
00471 
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474 
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478     rb_econv_t *ec;
00479     rb_econv_result_t ret;
00480     long len;
00481     VALUE newstr;
00482     const unsigned char *sp;
00483     unsigned char *dp;
00484 
00485     if (!to) return str;
00486     if (from == to) return str;
00487     if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488         to == rb_ascii8bit_encoding()) {
00489         if (STR_ENC_GET(str) != to) {
00490             str = rb_str_dup(str);
00491             rb_enc_associate(str, to);
00492         }
00493         return str;
00494     }
00495 
00496     len = RSTRING_LEN(str);
00497     newstr = rb_str_new(0, len);
00498 
00499   retry:
00500     ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501     if (!ec) return str;
00502 
00503     sp = (unsigned char*)RSTRING_PTR(str);
00504     dp = (unsigned char*)RSTRING_PTR(newstr);
00505     ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506                            &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507     rb_econv_close(ec);
00508     switch (ret) {
00509       case econv_destination_buffer_full:
00510         /* destination buffer short */
00511         len = len < 2 ? 2 : len * 2;
00512         rb_str_resize(newstr, len);
00513         goto retry;
00514 
00515       case econv_finished:
00516         len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517         rb_str_set_len(newstr, len);
00518         rb_enc_associate(newstr, to);
00519         return newstr;
00520 
00521       default:
00522         /* some error, return original */
00523         return str;
00524     }
00525 }
00526 
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530     return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532 
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536     VALUE str;
00537 
00538     str = rb_tainted_str_new(ptr, len);
00539     if (eenc == rb_usascii_encoding() &&
00540         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541         rb_enc_associate(str, rb_ascii8bit_encoding());
00542         return str;
00543     }
00544     rb_enc_associate(str, eenc);
00545     return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547 
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551     return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553 
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559 
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563     return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565 
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571 
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575     return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577 
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581     return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583 
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589 
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593     return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595 
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599     return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601 
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605     if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606         STR_SET_EMBED(str2);
00607         memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608         STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609     }
00610     else {
00611         str = rb_str_new_frozen(str);
00612         FL_SET(str2, STR_NOEMBED);
00613         RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614         RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615         RSTRING(str2)->as.heap.aux.shared = str;
00616         FL_SET(str2, ELTS_SHARED);
00617     }
00618     rb_enc_cr_str_exact_copy(str2, str);
00619 
00620     return str2;
00621 }
00622 
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626     return str_replace_shared(str_alloc(klass), str);
00627 }
00628 
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632     return str_new_shared(klass, str);
00633 }
00634 
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638     VALUE str2 = str_new3(rb_obj_class(str), str);
00639 
00640     OBJ_INFECT(str2, str);
00641     return str2;
00642 }
00643 
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646 
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650     VALUE str2;
00651 
00652     str2 = str_alloc(klass);
00653     STR_SET_NOEMBED(str2);
00654     RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655     RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656     if (STR_SHARED_P(str)) {
00657         VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658         assert(OBJ_FROZEN(shared));
00659         FL_SET(str2, ELTS_SHARED);
00660         RSTRING(str2)->as.heap.aux.shared = shared;
00661     }
00662     else {
00663         FL_SET(str, ELTS_SHARED);
00664         RSTRING(str)->as.heap.aux.shared = str2;
00665     }
00666     rb_enc_cr_str_exact_copy(str2, str);
00667     OBJ_INFECT(str2, str);
00668     return str2;
00669 }
00670 
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674     VALUE klass, str;
00675 
00676     if (OBJ_FROZEN(orig)) return orig;
00677     klass = rb_obj_class(orig);
00678     if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679         long ofs;
00680         assert(OBJ_FROZEN(str));
00681         ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682         if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683             (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684             ENCODING_GET(str) != ENCODING_GET(orig)) {
00685             str = str_new3(klass, str);
00686             RSTRING(str)->as.heap.ptr += ofs;
00687             RSTRING(str)->as.heap.len -= ofs;
00688             rb_enc_cr_str_exact_copy(str, orig);
00689             OBJ_INFECT(str, orig);
00690         }
00691     }
00692     else if (STR_EMBED_P(orig)) {
00693         str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694         rb_enc_cr_str_exact_copy(str, orig);
00695         OBJ_INFECT(str, orig);
00696     }
00697     else if (STR_ASSOC_P(orig)) {
00698         VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699         FL_UNSET(orig, STR_ASSOC);
00700         str = str_new4(klass, orig);
00701         FL_SET(str, STR_ASSOC);
00702         RSTRING(str)->as.heap.aux.shared = assoc;
00703     }
00704     else {
00705         str = str_new4(klass, orig);
00706     }
00707     OBJ_FREEZE(str);
00708     return str;
00709 }
00710 
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713 
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717     return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719 
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721            rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723 
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727     VALUE v = rb_str_new5(str, 0, 0);
00728     rb_enc_copy(v, str);
00729     OBJ_INFECT(v, str);
00730     return v;
00731 }
00732 
00733 #define STR_BUF_MIN_SIZE 128
00734 
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738     VALUE str = str_alloc(rb_cString);
00739 
00740     if (capa < STR_BUF_MIN_SIZE) {
00741         capa = STR_BUF_MIN_SIZE;
00742     }
00743     FL_SET(str, STR_NOEMBED);
00744     RSTRING(str)->as.heap.aux.capa = capa;
00745     RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746     RSTRING(str)->as.heap.ptr[0] = '\0';
00747 
00748     return str;
00749 }
00750 
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754     VALUE str;
00755     long len = strlen(ptr);
00756 
00757     str = rb_str_buf_new(len);
00758     rb_str_buf_cat(str, ptr, len);
00759 
00760     return str;
00761 }
00762 
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765 
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769     return str_new(0, 0, len);
00770 }
00771 
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775     VALUE s = rb_str_tmp_new(len);
00776     *store = s;
00777     return RSTRING_PTR(s);
00778 }
00779 
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783     VALUE s = *store;
00784     *store = 0;
00785     if (s) rb_str_clear(s);
00786 }
00787 
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792         xfree(RSTRING(str)->as.heap.ptr);
00793     }
00794 }
00795 
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799     if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800         return RSTRING(str)->as.heap.aux.capa;
00801     }
00802     else {
00803         return 0;
00804     }
00805 }
00806 
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810     return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812 
00813 static inline void str_discard(VALUE str);
00814 
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818     rb_encoding *enc;
00819     int cr;
00820     if (str == str2) return;
00821     enc = STR_ENC_GET(str2);
00822     cr = ENC_CODERANGE(str2);
00823     str_discard(str);
00824     OBJ_INFECT(str, str2);
00825     if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826         STR_SET_EMBED(str);
00827         memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828         STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829         rb_enc_associate(str, enc);
00830         ENC_CODERANGE_SET(str, cr);
00831         return;
00832     }
00833     STR_SET_NOEMBED(str);
00834     STR_UNSET_NOCAPA(str);
00835     RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836     RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837     if (STR_NOCAPA_P(str2)) {
00838         FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839         RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840     }
00841     else {
00842         RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843     }
00844     STR_SET_EMBED(str2);        /* abandon str2 */
00845     RSTRING_PTR(str2)[0] = 0;
00846     STR_SET_EMBED_LEN(str2, 0);
00847     rb_enc_associate(str, enc);
00848     ENC_CODERANGE_SET(str, cr);
00849 }
00850 
00851 static ID id_to_s;
00852 
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856     VALUE str;
00857 
00858     if (TYPE(obj) == T_STRING) {
00859         return obj;
00860     }
00861     str = rb_funcall(obj, id_to_s, 0);
00862     if (TYPE(str) != T_STRING)
00863         return rb_any_to_s(obj);
00864     if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865     return str;
00866 }
00867 
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871     long len;
00872 
00873     len = RSTRING_LEN(str2);
00874     if (STR_ASSOC_P(str2)) {
00875         str2 = rb_str_new4(str2);
00876     }
00877     if (STR_SHARED_P(str2)) {
00878         VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879         assert(OBJ_FROZEN(shared));
00880         STR_SET_NOEMBED(str);
00881         RSTRING(str)->as.heap.len = len;
00882         RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883         FL_SET(str, ELTS_SHARED);
00884         FL_UNSET(str, STR_ASSOC);
00885         RSTRING(str)->as.heap.aux.shared = shared;
00886     }
00887     else {
00888         str_replace_shared(str, str2);
00889     }
00890 
00891     OBJ_INFECT(str, str2);
00892     rb_enc_cr_str_exact_copy(str, str2);
00893     return str;
00894 }
00895 
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899     VALUE dup = str_alloc(klass);
00900     str_replace(dup, str);
00901     return dup;
00902 }
00903 
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907     return str_duplicate(rb_obj_class(str), str);
00908 }
00909 
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913     return str_replace(str_alloc(rb_cString), str);
00914 }
00915 
00916 /*
00917  *  call-seq:
00918  *     String.new(str="")   -> new_str
00919  *
00920  *  Returns a new string object containing a copy of <i>str</i>.
00921  */
00922 
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926     VALUE orig;
00927 
00928     if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929         rb_str_replace(str, orig);
00930     return str;
00931 }
00932 
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936     long c;
00937     const char *q;
00938 
00939     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941     }
00942     else if (rb_enc_asciicompat(enc)) {
00943         c = 0;
00944         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945             while (p < e) {
00946                 if (ISASCII(*p)) {
00947                     q = search_nonascii(p, e);
00948                     if (!q)
00949                         return c + (e - p);
00950                     c += q - p;
00951                     p = q;
00952                 }
00953                 p += rb_enc_fast_mbclen(p, e, enc);
00954                 c++;
00955             }
00956         }
00957         else {
00958             while (p < e) {
00959                 if (ISASCII(*p)) {
00960                     q = search_nonascii(p, e);
00961                     if (!q)
00962                         return c + (e - p);
00963                     c += q - p;
00964                     p = q;
00965                 }
00966                 p += rb_enc_mbclen(p, e, enc);
00967                 c++;
00968             }
00969         }
00970         return c;
00971     }
00972 
00973     for (c=0; p<e; c++) {
00974         p += rb_enc_mbclen(p, e, enc);
00975     }
00976     return c;
00977 }
00978 
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982     return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984 
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
          /*
           * Count the characters in the byte range [p, e) under `enc` and
           * report the code range observed while scanning through *cr.
           *
           * *cr is reset to 0 on entry.  Every well-formed character ORs in
           * ENC_CODERANGE_VALID, a malformed one assigns ENC_CODERANGE_BROKEN,
           * and a scan that set nothing finishes as ENC_CODERANGE_7BIT.
           * The fixed-width shortcut returns without touching *cr again,
           * so the caller sees 0 in that case.
           */
00988     long c;
00989     const char *q;
00990     int ret;
00991 
00992     *cr = 0;
00993     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
              /* Fixed-width encoding: byte length divided by the character
               * width, rounded up so a trailing partial unit counts as one. */
00994         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995     }
00996     else if (rb_enc_asciicompat(enc)) {
00997         c = 0;
00998         while (p < e) {
00999             if (ISASCII(*p)) {
                      /* Skip a whole run of ASCII bytes at once.
                       * NOTE(review): search_nonascii() is defined outside
                       * this chunk; from its use here a null result means no
                       * non-ASCII byte remains in [p, e) -- confirm. */
01000                 q = search_nonascii(p, e);
01001                 if (!q) {
01002                     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003                     return c + (e - p);  /* one character per remaining byte */
01004                 }
01005                 c += q - p;
01006                 p = q;
01007             }
01008             ret = rb_enc_precise_mbclen(p, e, enc);
01009             if (MBCLEN_CHARFOUND_P(ret)) {
01010                 *cr |= ENC_CODERANGE_VALID;
01011                 p += MBCLEN_CHARFOUND_LEN(ret);
01012             }
01013             else {
                      /* Malformed sequence: step over a single byte and
                       * still count it as one character. */
01014                 *cr = ENC_CODERANGE_BROKEN;
01015                 p++;
01016             }
01017             c++;
01018         }
01019         if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020         return c;
01021     }
01022 
          /* Variable-width, not ASCII-compatible: validate one character
           * per iteration. */
01023     for (c=0; p<e; c++) {
01024         ret = rb_enc_precise_mbclen(p, e, enc);
01025         if (MBCLEN_CHARFOUND_P(ret)) {
01026             *cr |= ENC_CODERANGE_VALID;
01027             p += MBCLEN_CHARFOUND_LEN(ret);
01028         }
01029         else {
01030             *cr = ENC_CODERANGE_BROKEN;
                  /* Advance by the minimum character width, but never past e. */
01031             if (p + rb_enc_mbminlen(enc) <= e)
01032                 p += rb_enc_mbminlen(enc);
01033             else
01034                 p = e;
01035         }
01036     }
01037     if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038     return c;
01039 }
01040 
01041 #ifdef NONASCII_MASK
      /* True when byte `c` is not of the form 10xxxxxx, i.e. not a UTF-8
       * continuation byte. */
01042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
01043 
01044 /*
01045  * UTF-8 leading bytes have either a 0xxxxxxx or a 11xxxxxx
01046  * bit representation (see http://en.wikipedia.org/wiki/UTF-8).
01047  * Therefore, the following pseudo code can detect a UTF-8 leading byte.
01048  *
01049  * if (!(byte & 0x80))
01050  *   byte |= 0x40;          // turn on bit6
01051  * return ((byte>>6) & 1);  // bit6 tells whether it is a leading byte.
01052  *
01053  * This function applies the above logic to every byte of the argument
01054  * word `s' at once, then sums the per-byte results.
01055  */
01056 static inline VALUE
01057 count_utf8_lead_bytes_with_word(const VALUE *s)
01058 {
01059     VALUE d = *s;
01060 
01061     /* Transform so that bit0 of each byte says "UTF-8 leading byte or not". */
          /* In every byte: bit6 |= ~bit7.  Other bits are disturbed as well,
           * but they are masked away two lines below. */
01062     d |= ~(d>>1);
01063     d >>= 6;
          /* NOTE(review): assumes NONASCII_MASK (defined outside this chunk)
           * has 0x80 in every byte, so >>7 keeps bit0 of each byte -- confirm. */
01064     d &= NONASCII_MASK >> 7;
01065 
01066     /* Sum the per-byte flags into the lowest byte. */
01067     d += (d>>8);
01068     d += (d>>16);
01069 #if SIZEOF_VALUE == 8
01070     d += (d>>32);
01071 #endif
          /* Only the low four bits of the folded sum are returned. */
01072     return (d&0xF);
01073 }
01074 #endif
01075 
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
          /*
           * Character length of `str` under `enc`; a null `enc` means the
           * string's own encoding.  When the generic scan runs and yields a
           * non-zero code range, that code range is stored back on `str`.
           */
01079     const char *p, *e;
01080     long n;
01081     int cr;
01082 
          /* One byte per character: the byte length is the answer. */
01083     if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084     if (!enc) enc = STR_ENC_GET(str);
01085     p = RSTRING_PTR(str);
01086     e = RSTRING_END(str);
01087     cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
          /* Fast path for known-valid UTF-8: the character count equals the
           * number of non-continuation bytes. */
01089     if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090         enc == rb_utf8_encoding()) {
01091 
01092         VALUE len = 0;
              /* Word-at-a-time counting is only used beyond two words. */
01093         if ((int)sizeof(VALUE) * 2 < e - p) {
01094             const VALUE *s, *t;
01095             const VALUE lowbits = sizeof(VALUE) - 1;
                  /* s: p rounded up, t: e rounded down to a VALUE boundary. */
01096             s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097             t = (const VALUE*)(~lowbits & (VALUE)e);
                  /* Unaligned head, byte by byte. */
01098             while (p < (const char *)s) {
01099                 if (is_utf8_lead_byte(*p)) len++;
01100                 p++;
01101             }
                  /* Aligned middle, one word per step. */
01102             while (s < t) {
01103                 len += count_utf8_lead_bytes_with_word(s);
01104                 s++;
01105             }
01106             p = (const char *)s;
01107         }
              /* Remaining tail (or the whole string when it was short). */
01108         while (p < e) {
01109             if (is_utf8_lead_byte(*p)) len++;
01110             p++;
01111         }
01112         return (long)len;
01113     }
01114 #endif
          /* Generic scan; rb_enc_strlen_cr() overwrites cr with what it saw. */
01115     n = rb_enc_strlen_cr(p, e, enc, &cr);
01116     if (cr) {
01117         ENC_CODERANGE_SET(str, cr);
01118     }
01119     return n;
01120 }
01121 
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125     return str_strlen(str, STR_ENC_GET(str));
01126 }
01127 
01128 /*
01129  *  call-seq:
01130  *     str.length   -> integer
01131  *     str.size     -> integer
01132  *
01133  *  Returns the character length of <i>str</i>.
01134  */
01135 
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139     long len;
01140 
01141     len = str_strlen(str, STR_ENC_GET(str));
01142     return LONG2NUM(len);
01143 }
01144 
01145 /*
01146  *  call-seq:
01147  *     str.bytesize  -> integer
01148  *
01149  *  Returns the length of <i>str</i> in bytes.
01150  */
01151 
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155     return LONG2NUM(RSTRING_LEN(str));
01156 }
01157 
01158 /*
01159  *  call-seq:
01160  *     str.empty?   -> true or false
01161  *
01162  *  Returns <code>true</code> if <i>str</i> has a length of zero.
01163  *
01164  *     "hello".empty?   #=> false
01165  *     "".empty?        #=> true
01166  */
01167 
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171     if (RSTRING_LEN(str) == 0)
01172         return Qtrue;
01173     return Qfalse;
01174 }
01175 
01176 /*
01177  *  call-seq:
01178  *     str + other_str   -> new_str
01179  *
01180  *  Concatenation---Returns a new <code>String</code> containing
01181  *  <i>other_str</i> concatenated to <i>str</i>.
01182  *
01183  *     "Hello from " + self.to_s   #=> "Hello from main"
01184  */
01185 
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189     VALUE str3;
01190     rb_encoding *enc;
01191 
01192     StringValue(str2);
01193     enc = rb_enc_check(str1, str2);
01194     str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195     memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196     memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197            RSTRING_PTR(str2), RSTRING_LEN(str2));
01198     RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199 
01200     if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201         OBJ_TAINT(str3);
01202     ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203                            ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204     return str3;
01205 }
01206 
01207 /*
01208  *  call-seq:
01209  *     str * integer   -> new_str
01210  *
01211  *  Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
01212  *  the receiver.
01213  *
01214  *     "Ho! " * 3   #=> "Ho! Ho! Ho! "
01215  */
01216 
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220     VALUE str2;
01221     long n, len;
01222     char *ptr2;
01223 
01224     len = NUM2LONG(times);
01225     if (len < 0) {
01226         rb_raise(rb_eArgError, "negative argument");
01227     }
01228     if (len && LONG_MAX/len <  RSTRING_LEN(str)) {
01229         rb_raise(rb_eArgError, "argument too big");
01230     }
01231 
01232     str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233     ptr2 = RSTRING_PTR(str2);
01234     if (len) {
01235         n = RSTRING_LEN(str);
01236         memcpy(ptr2, RSTRING_PTR(str), n);
01237         while (n <= len/2) {
01238             memcpy(ptr2 + n, ptr2, n);
01239             n *= 2;
01240         }
01241         memcpy(ptr2 + n, ptr2, len-n);
01242     }
01243     ptr2[RSTRING_LEN(str2)] = '\0';
01244     OBJ_INFECT(str2, str);
01245     rb_enc_cr_str_copy_for_substr(str2, str);
01246 
01247     return str2;
01248 }
01249 
01250 /*
01251  *  call-seq:
01252  *     str % arg   -> new_str
01253  *
01254  *  Format---Uses <i>str</i> as a format specification, and returns the result
01255  *  of applying it to <i>arg</i>. If the format specification contains more than
01256  *  one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
01257  *  containing the values to be substituted. See <code>Kernel::sprintf</code> for
01258  *  details of the format string.
01259  *
01260  *     "%05d" % 123                              #=> "00123"
01261  *     "%-5s: %08x" % [ "ID", self.object_id ]   #=> "ID   : 200e14d6"
01262  *     "foo = %{foo}" % { :foo => 'bar' }        #=> "foo = bar"
01263  */
01264 
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268     volatile VALUE tmp = rb_check_array_type(arg);
01269 
01270     if (!NIL_P(tmp)) {
01271         return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272     }
01273     return rb_str_format(1, &arg, str);
01274 }
01275 
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279     if (FL_TEST(str, STR_TMPLOCK)) {
01280         rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281     }
01282     rb_check_frozen(str);
01283     if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284         rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286 
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290     str_modifiable(str);
01291     if (!STR_SHARED_P(str)) return 1;
01292     if (STR_EMBED_P(str)) return 1;
01293     return 0;
01294 }
01295 
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
          /*
           * Give `str` a freshly allocated heap buffer with capacity
           * len + expand (plus one byte for the terminator) and copy the
           * current contents into it.  The previous buffer is not freed
           * in this function.
           */
01299     char *ptr;
01300     long len = RSTRING_LEN(str);
01301     long capa = len + expand;
01302 
          /* A negative `expand` shrinks: copy only what fits. */
01303     if (len > capa) len = capa;
01304     ptr = ALLOC_N(char, capa + 1);
01305     if (RSTRING_PTR(str)) {
01306         memcpy(ptr, RSTRING_PTR(str), len);
01307     }
          /* NOTE(review): the copy above happens before the flags change,
           * presumably because RSTRING_PTR depends on the embed flag -- confirm. */
01308     STR_SET_NOEMBED(str);
01309     STR_UNSET_NOCAPA(str);
01310     ptr[len] = 0;
01311     RSTRING(str)->as.heap.ptr = ptr;
01312     RSTRING(str)->as.heap.len = len;
01313     RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315 
      /* Same as above, keeping the current length as the capacity. */
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317 
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321     if (!str_independent(str))
01322         str_make_independent(str);
01323     ENC_CODERANGE_CLEAR(str);
01324 }
01325 
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329     if (expand < 0) {
01330         rb_raise(rb_eArgError, "negative expanding string size");
01331     }
01332     if (!str_independent(str)) {
01333         str_make_independent_expand(str, expand);
01334     }
01335     else if (expand > 0) {
01336         long len = RSTRING_LEN(str);
01337         long capa = len + expand;
01338         if (!STR_EMBED_P(str)) {
01339             REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340             STR_UNSET_NOCAPA(str);
01341             RSTRING(str)->as.heap.aux.capa = capa;
01342         }
01343         else if (capa > RSTRING_EMBED_LEN_MAX) {
01344             str_make_independent_expand(str, expand);
01345         }
01346     }
01347     ENC_CODERANGE_CLEAR(str);
01348 }
01349 
01350 /* As rb_str_modify(), but don't clear coderange */
01351 static void
01352 str_modify_keep_cr(VALUE str)
01353 {
01354     if (!str_independent(str))
01355         str_make_independent(str);
01356     if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01357         /* Force re-scan later */
01358         ENC_CODERANGE_CLEAR(str);
01359 }
01360 
01361 static inline void
01362 str_discard(VALUE str)
01363 {
01364     str_modifiable(str);
01365     if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01366         xfree(RSTRING_PTR(str));
01367         RSTRING(str)->as.heap.ptr = 0;
01368         RSTRING(str)->as.heap.len = 0;
01369     }
01370 }
01371 
01372 void
01373 rb_str_associate(VALUE str, VALUE add)
01374 {
          /*
           * Attach `add` to `str` through the string's aux.shared slot and
           * mark the string with STR_ASSOC.  If `str` already carries an
           * association, `add` is concatenated onto it with rb_ary_concat().
           * Raises if `str` is frozen.
           */
01375     /* sanity check */
01376     rb_check_frozen(str);
01377     if (STR_ASSOC_P(str)) {
01378         /* already associated */
01379         rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01380     }
01381     else {
01382         if (STR_SHARED_P(str)) {
                  /* Remember the shared source before str gets its own
                   * buffer (which stores a capacity in the aux slot). */
01383             VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01384             str_make_independent(str);
01385             if (STR_ASSOC_P(assoc)) {
                      /* The source had an association of its own: merge
                       * `add` into that array and attach the merged one. */
01386                 assoc = RSTRING(assoc)->as.heap.aux.shared;
01387                 rb_ary_concat(assoc, add);
01388                 add = assoc;
01389             }
01390         }
01391         else if (STR_EMBED_P(str)) {
                  /* Embedded strings are moved to a heap buffer first. */
01392             str_make_independent(str);
01393         }
01394         else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
                  /* NOTE(review): presumably the capacity is trimmed to the
                   * length because aux.capa is about to be replaced by
                   * aux.shared below -- confirm against the RString layout. */
01395             RESIZE_CAPA(str, RSTRING_LEN(str));
01396         }
01397         FL_SET(str, STR_ASSOC);
              /* NOTE(review): zeroing klass presumably hides the array from
               * Ruby-level code -- confirm. */
01398         RBASIC(add)->klass = 0;
01399         RSTRING(str)->as.heap.aux.shared = add;
01400     }
01401 }
01402 
01403 VALUE
01404 rb_str_associated(VALUE str)
01405 {
01406     if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01407     if (STR_ASSOC_P(str)) {
01408         return RSTRING(str)->as.heap.aux.shared;
01409     }
01410     return Qfalse;
01411 }
01412 
01413 VALUE
01414 rb_string_value(volatile VALUE *ptr)
01415 {
01416     VALUE s = *ptr;
01417     if (TYPE(s) != T_STRING) {
01418         s = rb_str_to_str(s);
01419         *ptr = s;
01420     }
01421     return s;
01422 }
01423 
01424 char *
01425 rb_string_value_ptr(volatile VALUE *ptr)
01426 {
01427     VALUE str = rb_string_value(ptr);
01428     return RSTRING_PTR(str);
01429 }
01430 
01431 char *
01432 rb_string_value_cstr(volatile VALUE *ptr)
01433 {
01434     VALUE str = rb_string_value(ptr);
01435     char *s = RSTRING_PTR(str);
01436     long len = RSTRING_LEN(str);
01437 
01438     if (!s || memchr(s, 0, len)) {
01439         rb_raise(rb_eArgError, "string contains null byte");
01440     }
01441     if (s[len]) {
01442         rb_str_modify(str);
01443         s = RSTRING_PTR(str);
01444         s[RSTRING_LEN(str)] = 0;
01445     }
01446     return s;
01447 }
01448 
01449 VALUE
01450 rb_check_string_type(VALUE str)
01451 {
01452     str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01453     return str;
01454 }
01455 
01456 /*
01457  *  call-seq:
01458  *     String.try_convert(obj) -> string or nil
01459  *
01460  *  Try to convert <i>obj</i> into a String, using to_str method.
01461  *  Returns converted string or nil if <i>obj</i> cannot be converted
01462  *  for any reason.
01463  *
01464  *     String.try_convert("str")     #=> "str"
01465  *     String.try_convert(/re/)      #=> nil
01466  */
01467 static VALUE
01468 rb_str_s_try_convert(VALUE dummy, VALUE str)
01469 {
01470     return rb_check_string_type(str);
01471 }
01472 
01473 static char*
01474 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01475 {
01476     long nth = *nthp;
01477     if (rb_enc_mbmaxlen(enc) == 1) {
01478         p += nth;
01479     }
01480     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01481         p += nth * rb_enc_mbmaxlen(enc);
01482     }
01483     else if (rb_enc_asciicompat(enc)) {
01484         const char *p2, *e2;
01485         int n;
01486 
01487         while (p < e && 0 < nth) {
01488             e2 = p + nth;
01489             if (e < e2) {
01490                 *nthp = nth;
01491                 return (char *)e;
01492             }
01493             if (ISASCII(*p)) {
01494                 p2 = search_nonascii(p, e2);
01495                 if (!p2) {
01496                     *nthp = nth;
01497                     return (char *)e2;
01498                 }
01499                 nth -= p2 - p;
01500                 p = p2;
01501             }
01502             n = rb_enc_mbclen(p, e, enc);
01503             p += n;
01504             nth--;
01505         }
01506         *nthp = nth;
01507         if (nth != 0) {
01508             return (char *)e;
01509         }
01510         return (char *)p;
01511     }
01512     else {
01513         while (p < e && nth--) {
01514             p += rb_enc_mbclen(p, e, enc);
01515         }
01516     }
01517     if (p > e) p = e;
01518     *nthp = nth;
01519     return (char*)p;
01520 }
01521 
01522 char*
01523 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01524 {
01525     return str_nth_len(p, e, &nth, enc);
01526 }
01527 
01528 static char*
01529 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01530 {
01531     if (singlebyte)
01532         p += nth;
01533     else {
01534         p = str_nth_len(p, e, &nth, enc);
01535     }
01536     if (!p) return 0;
01537     if (p > e) p = e;
01538     return (char *)p;
01539 }
01540 
01541 /* char offset to byte offset */
01542 static long
01543 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01544 {
01545     const char *pp = str_nth(p, e, nth, enc, singlebyte);
01546     if (!pp) return e - p;
01547     return pp - p;
01548 }
01549 
01550 long
01551 rb_str_offset(VALUE str, long pos)
01552 {
01553     return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01554                       STR_ENC_GET(str), single_byte_optimizable(str));
01555 }
01556 
01557 #ifdef NONASCII_MASK
01558 static char *
01559 str_utf8_nth(const char *p, const char *e, long *nthp)
01560 {
          /*
           * Step *nthp characters forward from `p` inside [p, e) by counting
           * bytes for which is_utf8_lead_byte() holds.  Returns the position
           * reached and stores the count that is still outstanding in *nthp
           * (0 when the requested character was found).
           */
01561     long nth = *nthp;
          /* Word-at-a-time skipping is only used when both the buffer and
           * the requested distance exceed two words. */
01562     if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01563         const VALUE *s, *t;
01564         const VALUE lowbits = sizeof(VALUE) - 1;
              /* s: p rounded up, t: e rounded down to a VALUE boundary. */
01565         s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01566         t = (const VALUE*)(~lowbits & (VALUE)e);
              /* Unaligned head, byte by byte. */
01567         while (p < (const char *)s) {
01568             if (is_utf8_lead_byte(*p)) nth--;
01569             p++;
01570         }
              /* Whole words, as long as at least sizeof(VALUE) characters
               * are still outstanding, so one word cannot overshoot. */
01571         do {
01572             nth -= count_utf8_lead_bytes_with_word(s);
01573             s++;
01574         } while (s < t && (int)sizeof(VALUE) <= nth);
01575         p = (char *)s;
01576     }
          /* Finish byte by byte; stop on the lead byte of the wanted character. */
01577     while (p < e) {
01578         if (is_utf8_lead_byte(*p)) {
01579             if (nth == 0) break;
01580             nth--;
01581         }
01582         p++;
01583     }
01584     *nthp = nth;
01585     return (char *)p;
01586 }
01587 
/* Byte distance from `p` to the position str_utf8_nth() reaches for `nth`. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long counter = nth;
    const char *reached = str_utf8_nth(p, e, &counter);

    return reached - p;
}
01594 #endif
01595 
01596 /* byte offset to char offset */
01597 long
01598 rb_str_sublen(VALUE str, long pos)
01599 {
01600     if (single_byte_optimizable(str) || pos < 0)
01601         return pos;
01602     else {
01603         char *p = RSTRING_PTR(str);
01604         return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01605     }
01606 }
01607 
01608 VALUE
01609 rb_str_subseq(VALUE str, long beg, long len)
01610 {
01611     VALUE str2;
01612 
01613     if (RSTRING_LEN(str) == beg + len &&
01614         RSTRING_EMBED_LEN_MAX < len) {
01615         str2 = rb_str_new_shared(rb_str_new_frozen(str));
01616         rb_str_drop_bytes(str2, beg);
01617     }
01618     else {
01619         str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01620     }
01621 
01622     rb_enc_cr_str_copy_for_substr(str2, str);
01623     OBJ_INFECT(str2, str);
01624 
01625     return str2;
01626 }
01627 
01628 VALUE
01629 rb_str_substr(VALUE str, long beg, long len)
01630 {
01631     rb_encoding *enc = STR_ENC_GET(str);
01632     VALUE str2;
01633     char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01634 
01635     if (len < 0) return Qnil;
01636     if (!RSTRING_LEN(str)) {
01637         len = 0;
01638     }
01639     if (single_byte_optimizable(str)) {
01640         if (beg > RSTRING_LEN(str)) return Qnil;
01641         if (beg < 0) {
01642             beg += RSTRING_LEN(str);
01643             if (beg < 0) return Qnil;
01644         }
01645         if (beg + len > RSTRING_LEN(str))
01646             len = RSTRING_LEN(str) - beg;
01647         if (len <= 0) {
01648             len = 0;
01649             p = 0;
01650         }
01651         else
01652             p = s + beg;
01653         goto sub;
01654     }
01655     if (beg < 0) {
01656         if (len > -beg) len = -beg;
01657         if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01658             beg = -beg;
01659             while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01660             p = e;
01661             if (!p) return Qnil;
01662             while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01663             if (!p) return Qnil;
01664             len = e - p;
01665             goto sub;
01666         }
01667         else {
01668             beg += str_strlen(str, enc);
01669             if (beg < 0) return Qnil;
01670         }
01671     }
01672     else if (beg > 0 && beg > RSTRING_LEN(str)) {
01673         return Qnil;
01674     }
01675     if (len == 0) {
01676         if (beg > str_strlen(str, enc)) return Qnil;
01677         p = 0;
01678     }
01679 #ifdef NONASCII_MASK
01680     else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01681         enc == rb_utf8_encoding()) {
01682         p = str_utf8_nth(s, e, &beg);
01683         if (beg > 0) return Qnil;
01684         len = str_utf8_offset(p, e, len);
01685     }
01686 #endif
01687     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01688         int char_sz = rb_enc_mbmaxlen(enc);
01689 
01690         p = s + beg * char_sz;
01691         if (p > e) {
01692             return Qnil;
01693         }
01694         else if (len * char_sz > e - p)
01695             len = e - p;
01696         else
01697             len *= char_sz;
01698     }
01699     else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01700         if (beg > 0) return Qnil;
01701         len = 0;
01702     }
01703     else {
01704         len = str_offset(p, e, len, enc, 0);
01705     }
01706   sub:
01707     if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01708         str2 = rb_str_new4(str);
01709         str2 = str_new3(rb_obj_class(str2), str2);
01710         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01711         RSTRING(str2)->as.heap.len = len;
01712     }
01713     else {
01714         str2 = rb_str_new5(str, p, len);
01715         rb_enc_cr_str_copy_for_substr(str2, str);
01716         OBJ_INFECT(str2, str);
01717     }
01718 
01719     return str2;
01720 }
01721 
01722 VALUE
01723 rb_str_freeze(VALUE str)
01724 {
01725     if (STR_ASSOC_P(str)) {
01726         VALUE ary = RSTRING(str)->as.heap.aux.shared;
01727         OBJ_FREEZE(ary);
01728     }
01729     return rb_obj_freeze(str);
01730 }
01731 
01732 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
      /* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk;
       * presumably it provides rb_str_dup_frozen as an alias that calls
       * rb_str_new_frozen -- confirm. */
      /* From here on, uses of rb_str_dup_frozen in this file expand to
       * rb_str_new_frozen. */
01733 #define rb_str_dup_frozen rb_str_new_frozen
01734 
01735 VALUE
01736 rb_str_locktmp(VALUE str)
01737 {
01738     if (FL_TEST(str, STR_TMPLOCK)) {
01739         rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01740     }
01741     FL_SET(str, STR_TMPLOCK);
01742     return str;
01743 }
01744 
01745 VALUE
01746 rb_str_unlocktmp(VALUE str)
01747 {
01748     if (!FL_TEST(str, STR_TMPLOCK)) {
01749         rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01750     }
01751     FL_UNSET(str, STR_TMPLOCK);
01752     return str;
01753 }
01754 
01755 VALUE
01756 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
01757 {
01758     rb_str_locktmp(str);
01759     return rb_ensure(func, arg, rb_str_unlocktmp, str);
01760 }
01761 
01762 void
01763 rb_str_set_len(VALUE str, long len)
01764 {
01765     long capa;
01766 
01767     str_modifiable(str);
01768     if (STR_SHARED_P(str)) {
01769         rb_raise(rb_eRuntimeError, "can't set length of shared string");
01770     }
01771     if (len > (capa = (long)rb_str_capacity(str))) {
01772         rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01773     }
01774     STR_SET_LEN(str, len);
01775     RSTRING_PTR(str)[len] = '\0';
01776 }
01777 
01778 VALUE
01779 rb_str_resize(VALUE str, long len)
01780 {
          /*
           * Change the byte length of `str` to `len`, switching between the
           * embedded and heap representations as needed, and keep a NUL
           * terminator after the contents.  Returns `str`.  Raises
           * ArgumentError for a negative `len`; str_independent() also runs
           * the modifiability checks.  The cached code range is cleared
           * even when the length does not change.
           */
01781     long slen;
01782     int independent;
01783 
01784     if (len < 0) {
01785         rb_raise(rb_eArgError, "negative string size (or size too big)");
01786     }
01787 
01788     independent = str_independent(str);
01789     ENC_CODERANGE_CLEAR(str);
01790     slen = RSTRING_LEN(str);
01791     if (len != slen) {
01792         if (STR_EMBED_P(str)) {
01793             if (len <= RSTRING_EMBED_LEN_MAX) {
                      /* Still fits in the embedded buffer. */
01794                 STR_SET_EMBED_LEN(str, len);
01795                 RSTRING(str)->as.ary[len] = '\0';
01796                 return str;
01797             }
                  /* Outgrew the embedded buffer: move to the heap. */
01798             str_make_independent_expand(str, len - slen);
01799             STR_SET_NOEMBED(str);
01800         }
01801         else if (len <= RSTRING_EMBED_LEN_MAX) {
                  /* Heap string that now fits embedded: take the heap
                   * pointer before switching the representation, then copy
                   * the surviving bytes into the embedded array. */
01802             char *ptr = RSTRING(str)->as.heap.ptr;
01803             STR_SET_EMBED(str);
01804             if (slen > len) slen = len;
01805             if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01806             RSTRING(str)->as.ary[len] = '\0';
01807             STR_SET_EMBED_LEN(str, len);
                  /* Only a buffer this string owns is freed. */
01808             if (independent) xfree(ptr);
01809             return str;
01810         }
01811         else if (!independent) {
                  /* Shared buffer: take a private copy of the new size. */
01812             str_make_independent_expand(str, len - slen);
01813         }
01814         else if (slen < len || slen - len > 1024) {
                  /* Reallocate when growing, or when shrinking by more than
                   * 1024 bytes; smaller shrinks keep the current buffer. */
01815             REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01816         }
01817         if (!STR_NOCAPA_P(str)) {
01818             RSTRING(str)->as.heap.aux.capa = len;
01819         }
01820         RSTRING(str)->as.heap.len = len;
01821         RSTRING(str)->as.heap.ptr[len] = '\0';  /* sentinel */
01822     }
01823     return str;
01824 }
01825 
01826 static VALUE
01827 str_buf_cat(VALUE str, const char *ptr, long len)
01828 {
01829     long capa, total, off = -1;
01830 
01831     if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01832         off = ptr - RSTRING_PTR(str);
01833     }
01834     rb_str_modify(str);
01835     if (len == 0) return 0;
01836     if (STR_ASSOC_P(str)) {
01837         FL_UNSET(str, STR_ASSOC);
01838         capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01839     }
01840     else if (STR_EMBED_P(str)) {
01841         capa = RSTRING_EMBED_LEN_MAX;
01842     }
01843     else {
01844         capa = RSTRING(str)->as.heap.aux.capa;
01845     }
01846     if (RSTRING_LEN(str) >= LONG_MAX - len) {
01847         rb_raise(rb_eArgError, "string sizes too big");
01848     }
01849     total = RSTRING_LEN(str)+len;
01850     if (capa <= total) {
01851         while (total > capa) {
01852             if (capa + 1 >= LONG_MAX / 2) {
01853                 capa = (total + 4095) / 4096;
01854                 break;
01855             }
01856             capa = (capa + 1) * 2;
01857         }
01858         RESIZE_CAPA(str, capa);
01859     }
01860     if (off != -1) {
01861         ptr = RSTRING_PTR(str) + off;
01862     }
01863     memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01864     STR_SET_LEN(str, total);
01865     RSTRING_PTR(str)[total] = '\0'; /* sentinel */
01866 
01867     return str;
01868 }
01869 
01870 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01871 
01872 VALUE
01873 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01874 {
01875     if (len == 0) return str;
01876     if (len < 0) {
01877         rb_raise(rb_eArgError, "negative string size (or size too big)");
01878     }
01879     return str_buf_cat(str, ptr, len);
01880 }
01881 
01882 VALUE
01883 rb_str_buf_cat2(VALUE str, const char *ptr)
01884 {
01885     return rb_str_buf_cat(str, ptr, strlen(ptr));
01886 }
01887 
01888 VALUE
01889 rb_str_cat(VALUE str, const char *ptr, long len)
01890 {
01891     if (len < 0) {
01892         rb_raise(rb_eArgError, "negative string size (or size too big)");
01893     }
01894     if (STR_ASSOC_P(str)) {
01895         char *p;
01896         rb_str_modify_expand(str, len);
01897         p = RSTRING(str)->as.heap.ptr;
01898         memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01899         len = RSTRING(str)->as.heap.len += len;
01900         p[len] = '\0'; /* sentinel */
01901         return str;
01902     }
01903 
01904     return rb_str_buf_cat(str, ptr, len);
01905 }
01906 
01907 VALUE
01908 rb_str_cat2(VALUE str, const char *ptr)
01909 {
01910     return rb_str_cat(str, ptr, strlen(ptr));
01911 }
01912 
01913 static VALUE
01914 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01915     int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01916 {
          /*
           * Append `len` bytes at `ptr`, whose encoding index is
           * `ptr_encindex` and whose code range is `ptr_cr`, to `str`, and
           * set the encoding and code range of the result.  When
           * `ptr_cr_ret` is non-null it receives the code range finally
           * used for the appended bytes (which may have been scanned here).
           * Raises EncCompatError for incompatible encodings and
           * ArgumentError for a negative `len`.  Returns `str`.
           */
01917     int str_encindex = ENCODING_GET(str);
01918     int res_encindex;
01919     int str_cr, res_cr;
01920 
01921     str_cr = ENC_CODERANGE(str);
01922 
01923     if (str_encindex == ptr_encindex) {
              /* Same encoding: the bytes are scanned only when the
               * receiver's own code range is known. */
01924         if (str_cr == ENC_CODERANGE_UNKNOWN)
01925             ptr_cr = ENC_CODERANGE_UNKNOWN;
01926         else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01927             ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01928         }
01929     }
01930     else {
01931         rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01932         rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01933         if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
                  /* At least one side is not ASCII-compatible: appending
                   * nothing is fine, an empty receiver takes over the
                   * encoding of the appended bytes, anything else fails. */
01934             if (len == 0)
01935                 return str;
01936             if (RSTRING_LEN(str) == 0) {
01937                 rb_str_buf_cat(str, ptr, len);
01938                 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01939                 return str;
01940             }
01941             goto incompatible;
01942         }
01943         if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01944             ptr_cr = coderange_scan(ptr, len, ptr_enc);
01945         }
01946         if (str_cr == ENC_CODERANGE_UNKNOWN) {
                  /* The receiver is scanned only when its code range can
                   * influence the compatibility decision below. */
01947             if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01948                 str_cr = rb_enc_str_coderange(str);
01949             }
01950         }
01951     }
01952     if (ptr_cr_ret)
01953         *ptr_cr_ret = ptr_cr;
01954 
          /* Different encodings can only be mixed when at least one side is
           * pure 7-bit. */
01955     if (str_encindex != ptr_encindex &&
01956         str_cr != ENC_CODERANGE_7BIT &&
01957         ptr_cr != ENC_CODERANGE_7BIT) {
            /* Also reached by the goto above. */
01958       incompatible:
01959         rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01960             rb_enc_name(rb_enc_from_index(str_encindex)),
01961             rb_enc_name(rb_enc_from_index(ptr_encindex)));
01962     }
01963 
          /* Choose the encoding and code range of the concatenation. */
01964     if (str_cr == ENC_CODERANGE_UNKNOWN) {
01965         res_encindex = str_encindex;
01966         res_cr = ENC_CODERANGE_UNKNOWN;
01967     }
01968     else if (str_cr == ENC_CODERANGE_7BIT) {
01969         if (ptr_cr == ENC_CODERANGE_7BIT) {
01970             res_encindex = str_encindex;
01971             res_cr = ENC_CODERANGE_7BIT;
01972         }
01973         else {
                  /* 7-bit receiver plus non-7-bit bytes: the result takes
                   * the encoding and code range of the appended bytes. */
01974             res_encindex = ptr_encindex;
01975             res_cr = ptr_cr;
01976         }
01977     }
01978     else if (str_cr == ENC_CODERANGE_VALID) {
01979         res_encindex = str_encindex;
01980         if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01981             res_cr = str_cr;
01982         else
01983             res_cr = ptr_cr;
01984     }
01985     else { /* str_cr == ENC_CODERANGE_BROKEN */
01986         res_encindex = str_encindex;
01987         res_cr = str_cr;
              /* NOTE(review): presumably new bytes could complete a broken
               * trailing sequence, hence the reset to UNKNOWN -- confirm. */
01988         if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01989     }
01990 
          /* NOTE(review): this check runs after coderange_scan() may already
           * have been called with the same `len` above. */
01991     if (len < 0) {
01992         rb_raise(rb_eArgError, "negative string size (or size too big)");
01993     }
01994     str_buf_cat(str, ptr, len);
01995     ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01996     return str;
01997 }
01998 
01999 VALUE
02000 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
02001 {
02002     return rb_enc_cr_str_buf_cat(str, ptr, len,
02003         rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
02004 }
02005 
02006 VALUE
02007 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02008 {
02009     /* ptr must reference NUL terminated ASCII string. */
02010     int encindex = ENCODING_GET(str);
02011     rb_encoding *enc = rb_enc_from_index(encindex);
02012     if (rb_enc_asciicompat(enc)) {
02013         return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02014             encindex, ENC_CODERANGE_7BIT, 0);
02015     }
02016     else {
02017         char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02018         while (*ptr) {
02019             unsigned int c = (unsigned char)*ptr;
02020             int len = rb_enc_codelen(c, enc);
02021             rb_enc_mbcput(c, buf, enc);
02022             rb_enc_cr_str_buf_cat(str, buf, len,
02023                 encindex, ENC_CODERANGE_VALID, 0);
02024             ptr++;
02025         }
02026         return str;
02027     }
02028 }
02029 
02030 VALUE
02031 rb_str_buf_append(VALUE str, VALUE str2)
02032 {
02033     int str2_cr;
02034 
02035     str2_cr = ENC_CODERANGE(str2);
02036 
02037     rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02038         ENCODING_GET(str2), str2_cr, &str2_cr);
02039 
02040     OBJ_INFECT(str, str2);
02041     ENC_CODERANGE_SET(str2, str2_cr);
02042 
02043     return str;
02044 }
02045 
      /*
       * Appends +str2+ (converted with StringValue) to +str+ and returns +str+.
       *
       * The common case is delegated to rb_str_buf_append().  Only when +str2+
       * is non-empty and +str+ satisfies STR_ASSOC_P() is the copy done here,
       * directly on the heap representation of +str+.
       */
02046 VALUE
02047 rb_str_append(VALUE str, VALUE str2)
02048 {
02049     rb_encoding *enc;
          int cr, add_cr;
          long add_len, total_len;

02053     StringValue(str2);
          add_len = RSTRING_LEN(str2);
          if (add_len <= 0 || !STR_ASSOC_P(str)) {
              return rb_str_buf_append(str, str2);
          }

          total_len = RSTRING_LEN(str) + add_len;
          enc = rb_enc_check(str, str2);
          /* The result keeps the larger of the two code range values. */
          cr = ENC_CODERANGE(str);
          add_cr = ENC_CODERANGE(str2);
          if (add_cr > cr) cr = add_cr;
          rb_str_modify_expand(str, add_len);
          /* add_len+1 bytes: one byte past the contents of str2 is copied as well. */
          memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
                 RSTRING_PTR(str2), add_len+1);
          RSTRING(str)->as.heap.len = total_len;
          rb_enc_associate(str, enc);
          ENC_CODERANGE_SET(str, cr);
          OBJ_INFECT(str, str2);
          return str;
02069 }
02070 
02071 /*
02072  *  call-seq:
02073  *     str << integer       -> str
02074  *     str.concat(integer)  -> str
02075  *     str << obj           -> str
02076  *     str.concat(obj)      -> str
02077  *
02078  *  Append---Concatenates the given object to <i>str</i>. If the object is an
02079  *  <code>Integer</code>, it is considered as a codepoint, and is converted
02080  *  to a character before concatenation.
02081  *
02082  *     a = "hello "
02083  *     a << "world"   #=> "hello world"
02084  *     a.concat(33)   #=> "hello world!"
02085  */
02086 
02087 VALUE
02088 rb_str_concat(VALUE str1, VALUE str2)
02089 {
02090     unsigned int code;
02091     rb_encoding *enc = STR_ENC_GET(str1);
02092 
          /*
           * A Fixnum or Bignum argument is taken as a codepoint; any other
           * object is handed to rb_str_append().
           */
02093     if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02094         if (rb_num_to_uint(str2, &code) == 0) {
                  /* accepted: code is used as the codepoint below */
02095         }
02096         else if (FIXNUM_P(str2)) {
02097             rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02098         }
02099         else {
02100             rb_raise(rb_eRangeError, "bignum out of char range");
02101         }
02102     }
02103     else {
02104         return rb_str_append(str1, str2);
02105     }
02106 
02107     if (enc == rb_usascii_encoding()) {
02108         /* US-ASCII automatically extended to ASCII-8BIT */
02109         char buf[1];
02110         buf[0] = (char)code;
02111         if (code > 0xFF) {
02112             rb_raise(rb_eRangeError, "%u out of char range", code);
02113         }
02114         rb_str_cat(str1, buf, 1);
02115         if (code > 127) {
02116             rb_enc_associate(str1, rb_ascii8bit_encoding());
02117             ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02118         }
02119     }
02120     else {
02121         long pos = RSTRING_LEN(str1);
02122         int cr = ENC_CODERANGE(str1);
02123         int len;
02124         char *buf;
02125 
              /* rb_enc_codelen() yields the encoded length or one of the error values below. */
02126         switch (len = rb_enc_codelen(code, enc)) {
02127           case ONIGERR_INVALID_CODE_POINT_VALUE:
02128             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02129             break;
02130           case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02131           case 0:
02132             rb_raise(rb_eRangeError, "%u out of char range", code);
02133             break;
02134         }
02135         buf = ALLOCA_N(char, len + 1);
02136         rb_enc_mbcput(code, buf, enc);
              /* Re-scan the bytes just written; reject the codepoint if they do not form one len-byte character. */
02137         if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02138             rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02139         }
02140         rb_str_resize(str1, pos+len);
              /*
               * The encoded character is raw bytes, not a C string: in wide
               * encodings (e.g. UTF-16/UTF-32) it can contain 0x00 bytes.
               * strncpy() would stop at the first such byte and zero-fill the
               * remainder, corrupting the character, so copy exactly len bytes.
               */
              memcpy(RSTRING_PTR(str1) + pos, buf, len);
02142         if (cr == ENC_CODERANGE_7BIT && code > 127)
02143             cr = ENC_CODERANGE_VALID;
02144         ENC_CODERANGE_SET(str1, cr);
02145     }
02146     return str1;
02147 }
02148 
02149 /*
02150  *  call-seq:
02151  *     str.prepend(other_str)  -> str
02152  *
02153  *  Prepend---Prepend the given string to <i>str</i>.
02154  *
02155  *  a = "world"
02156  *  a.prepend("hello ") #=> "hello world"
02157  *  a                   #=> "hello world"
02158  */
02159 
02160 static VALUE
02161 rb_str_prepend(VALUE str, VALUE str2)
02162 {
          /* Both numeric arguments handed to rb_str_update() are zero. */
          const long insert_at = 0L;
          const long replaced = 0L;

          /* The argument is converted before the receiver. */
02163     StringValue(str2);
02164     StringValue(str);
          rb_str_update(str, insert_at, replaced, str2);
02166     return str;
02167 }
02168 
      /*
       * Hashes the bytes of +str+ with rb_memhash() and XORs the result with
       * the encoding index of +str+.  The index is replaced by 0 when the code
       * range of +str+ is ENC_CODERANGE_7BIT, so such strings hash the same
       * regardless of their encoding index.
       */
02169 st_index_t
02170 rb_str_hash(VALUE str)
02171 {
          int enc_salt = ENCODING_GET(str);

          /* The code range is only looked up for a non-zero encoding index. */
          if (enc_salt != 0 && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
              enc_salt = 0;
          return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ enc_salt;
02177 }
02178 
      /*
       * Returns 0 when +str1+ and +str2+ pass rb_str_comparable() and hold the
       * same number of identical bytes; returns 1 in every other case.
       */
02179 int
02180 rb_str_hash_cmp(VALUE str1, VALUE str2)
02181 {
          long len2;

02184     if (!rb_str_comparable(str1, str2)) return 1;
          len2 = RSTRING_LEN(str2);
          if (RSTRING_LEN(str1) != len2) return 1;
          return memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len2) == 0 ? 0 : 1;
02190 }
02191 
02192 /*
02193  * call-seq:
02194  *    str.hash   -> fixnum
02195  *
02196  * Return a hash based on the string's length and content.
02197  */
02198 
02199 static VALUE
02200 rb_str_hash_m(VALUE str)
02201 {
          /* Take the rb_str_hash() value and wrap it with INT2FIX. */
          const st_index_t digest = rb_str_hash(str);

          return INT2FIX(digest);
02204 }
02205 
      /*
       * Yields the smaller of a and b (a when they compare equal).  Each
       * argument appears twice in the expansion, so pass only expressions
       * without side effects.
       */
02206 #define lesser(a,b) (((a)>(b))?(b):(a))
02207 
      /*
       * Reports whether the bytes of +str1+ and +str2+ may be compared directly.
       *
       * Returns TRUE when either string is empty, when both carry the same
       * encoding index, when both have code range ENC_CODERANGE_7BIT, or when
       * one is ENC_CODERANGE_7BIT and the encoding of the other satisfies
       * rb_enc_asciicompat().  Returns FALSE otherwise.
       */
02208 int
02209 rb_str_comparable(VALUE str1, VALUE str2)
02210 {
          int encidx1, encidx2;
          int cr1, cr2;

          if (RSTRING_LEN(str1) == 0 || RSTRING_LEN(str2) == 0) return TRUE;
          encidx1 = ENCODING_GET(str1);
          encidx2 = ENCODING_GET(str2);
          if (encidx1 == encidx2) return TRUE;
          /* Both code ranges are always computed before any of them is tested. */
          cr1 = rb_enc_str_coderange(str1);
          cr2 = rb_enc_str_coderange(str2);
          if (cr1 == ENC_CODERANGE_7BIT &&
              (cr2 == ENC_CODERANGE_7BIT ||
               rb_enc_asciicompat(rb_enc_from_index(encidx2)))) {
              return TRUE;
          }
          if (cr2 == ENC_CODERANGE_7BIT &&
              rb_enc_asciicompat(rb_enc_from_index(encidx1))) {
              return TRUE;
          }
02230     return FALSE;
02231 }
02232 
      /*
       * Three-way comparison of +str1+ and +str2+; returns -1, 0 or 1.
       *
       * The common prefix is compared with memcmp(); if it is equal the longer
       * string is greater.  Strings with identical bytes that fail
       * rb_str_comparable() are ordered by encoding index instead of being
       * reported equal.
       */
02233 int
02234 rb_str_cmp(VALUE str1, VALUE str2)
02235 {
02236     long len1, len2;
02237     const char *ptr1, *ptr2;

02240     if (str1 == str2) return 0;
02241     RSTRING_GETMEM(str1, ptr1, len1);
02242     RSTRING_GETMEM(str2, ptr2, len2);
          /* Identical pointers make the byte comparison unnecessary. */
          if (ptr1 != ptr2) {
              const int order = memcmp(ptr1, ptr2, lesser(len1, len2));

              if (order != 0) return order > 0 ? 1 : -1;
          }
          if (len1 != len2) return len1 > len2 ? 1 : -1;
          if (rb_str_comparable(str1, str2)) return 0;
          return ENCODING_GET(str1) > ENCODING_GET(str2) ? 1 : -1;
02257 }
02258 
02259 /* expect tail call optimization */
      /*
       * Returns Qtrue when +str1+ and +str2+ have the same byte length, pass
       * rb_str_comparable(), and hold identical bytes; Qfalse otherwise.
       */
02260 static VALUE
02261 str_eql(const VALUE str1, const VALUE str2)
02262 {
          const long len1 = RSTRING_LEN(str1);
          const char *bytes1, *bytes2;

          if (len1 != RSTRING_LEN(str2) || !rb_str_comparable(str1, str2))
              return Qfalse;
          bytes1 = RSTRING_PTR(str1);
          bytes2 = RSTRING_PTR(str2);
          /* Identical pointers: no need to look at the bytes. */
          if (bytes1 == bytes2) return Qtrue;
          return memcmp(bytes1, bytes2, len1) == 0 ? Qtrue : Qfalse;
02273 }
02274 /*
02275  *  call-seq:
02276  *     str == obj   -> true or false
02277  *
02278  *  Equality---If <i>obj</i> is not a <code>String</code>, returns
02279  *  <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
02280  *  <code><=></code> <i>obj</i> returns zero.
02281  */
02282 
02283 VALUE
02284 rb_str_equal(VALUE str1, VALUE str2)
02285 {
02286     if (str1 == str2) return Qtrue;
          if (TYPE(str2) == T_STRING) return str_eql(str1, str2);
          /*
           * Not a String: if the other object responds to to_str, let it
           * decide by calling rb_equal() with the operands swapped.
           */
          if (rb_respond_to(str2, rb_intern("to_str"))) {
              return rb_equal(str2, str1);
          }
          return Qfalse;
02294 }
02295 
02296 /*
02297  * call-seq:
02298  *   str.eql?(other)   -> true or false
02299  *
02300  * Two strings are equal if they have the same length and content.
02301  */
02302 
02303 static VALUE
02304 rb_str_eql(VALUE str1, VALUE str2)
02305 {
02306     if (str1 == str2) return Qtrue;
          /* No to_str handling here: a non-String argument is never eql?. */
          return TYPE(str2) == T_STRING ? str_eql(str1, str2) : Qfalse;
02309 }
02310 
02311 /*
02312  *  call-seq:
02313  *     str <=> other_str   -> -1, 0, +1 or nil
02314  *
02315  *  Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if
02316  *  <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than
02317  *  <i>str</i>. If the strings are of different lengths, and the strings are
02318  *  equal when compared up to the shortest length, then the longer string is
02319  *  considered greater than the shorter one. In older versions of Ruby, setting
02320  *  <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
02321  *  in favor of using <code>String#casecmp</code>.
02322  *
02323  *  <code><=></code> is the basis for the methods <code><</code>,
02324  *  <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
02325  *  included from module <code>Comparable</code>.  The method
02326  *  <code>String#==</code> does not use <code>Comparable#==</code>.
02327  *
02328  *     "abcdef" <=> "abcde"     #=> 1
02329  *     "abcdef" <=> "abcdef"    #=> 0
02330  *     "abcdef" <=> "abcdefg"   #=> -1
02331  *     "abcdef" <=> "ABCDEF"    #=> 1
02332  */
02333 
02334 static VALUE
02335 rb_str_cmp_m(VALUE str1, VALUE str2)
02336 {
02337     long result;
          VALUE reversed;

          if (TYPE(str2) == T_STRING) {
              result = rb_str_cmp(str1, str2);
              return LONG2NUM(result);
          }

          /* A non-String operand must respond to both to_str and <=>. */
          if (!rb_respond_to(str2, rb_intern("to_str")) ||
              !rb_respond_to(str2, rb_intern("<=>"))) {
              return Qnil;
          }

          /* Ask the other object to compare itself with str1, then negate. */
          reversed = rb_funcall(str2, rb_intern("<=>"), 1, str1);
          if (NIL_P(reversed)) return Qnil;
          if (!FIXNUM_P(reversed)) {
              /* Not a Fixnum: negate by calling 0 - reversed. */
              return rb_funcall(LONG2FIX(0), '-', 1, reversed);
          }
          result = -FIX2LONG(reversed);
02359     return LONG2NUM(result);
02360 }
02361 
02362 /*
02363  *  call-seq:
02364  *     str.casecmp(other_str)   -> -1, 0, +1 or nil
02365  *
02366  *  Case-insensitive version of <code>String#<=></code>.
02367  *
02368  *     "abcdef".casecmp("abcde")     #=> 1
02369  *     "aBcDeF".casecmp("abcdef")    #=> 0
02370  *     "abcdef".casecmp("abcdefg")   #=> -1
02371  *     "abcdef".casecmp("ABCDEF")    #=> 0
02372  */
02373 
02374 static VALUE
02375 rb_str_casecmp(VALUE str1, VALUE str2)
02376 {
02377     long len;
02378     rb_encoding *enc;
02379     char *p1, *p1end, *p2, *p2end;
02380 
02381     StringValue(str2);
          /* Qnil is returned when rb_enc_compatible() yields no encoding. */
02382     enc = rb_enc_compatible(str1, str2);
02383     if (!enc) {
02384         return Qnil;
02385     }
02386 
02387     p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02388     p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02389     if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
              /* Byte-wise walk; bytes are upper-cased only when they differ. */
02390         while (p1 < p1end && p2 < p2end) {
02391             if (*p1 != *p2) {
02392                 unsigned int c1 = TOUPPER(*p1 & 0xff);
02393                 unsigned int c2 = TOUPPER(*p2 & 0xff);
02394                 if (c1 != c2)
02395                     return INT2FIX(c1 < c2 ? -1 : 1);
02396             }
02397             p1++;
02398             p2++;
02399         }
02400     }
02401     else {
              /* Character-wise walk; l1/l2 are the byte lengths to advance by. */
02402         while (p1 < p1end && p2 < p2end) {
02403             int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02404             int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02405 
                  /*
                   * Both results non-negative: compare the upper-cased values.
                   * NOTE(review): a negative result presumably means "not an
                   * ASCII character" -- confirm against rb_enc_ascget().
                   */
02406             if (0 <= c1 && 0 <= c2) {
02407                 c1 = TOUPPER(c1);
02408                 c2 = TOUPPER(c2);
02409                 if (c1 != c2)
02410                     return INT2FIX(c1 < c2 ? -1 : 1);
02411             }
02412             else {
                      /*
                       * Otherwise compare the raw bytes of the two characters
                       * over the shorter length, then by length.
                       */
02413                 int r;
02414                 l1 = rb_enc_mbclen(p1, p1end, enc);
02415                 l2 = rb_enc_mbclen(p2, p2end, enc);
02416                 len = l1 < l2 ? l1 : l2;
02417                 r = memcmp(p1, p2, len);
02418                 if (r != 0)
02419                     return INT2FIX(r < 0 ? -1 : 1);
02420                 if (l1 != l2)
02421                     return INT2FIX(l1 < l2 ? -1 : 1);
02422             }
02423             p1 += l1;
02424             p2 += l2;
02425         }
02426     }
          /* One string ran out without a difference: decide by byte length. */
02427     if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02428     if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02429     return INT2FIX(-1);
02430 }
02431 
      /*
       * Searches +str+ for the first occurrence of +sub+, starting at character
       * position +offset+ (a negative +offset+ counts back from the end).
       *
       * Returns -1 when +sub+ is a broken string, when +offset+ lies before the
       * start of +str+, when fewer than the length of +sub+ characters remain
       * after +offset+, or when nothing is found.  Otherwise returns the match
       * position added to the value str_offset() produced for +offset+.
       * NOTE(review): that looks like a byte index (rb_str_index_m passes the
       * result through rb_str_sublen()) -- confirm.
       */
02432 static long
02433 rb_str_index(VALUE str, VALUE sub, long offset)
02434 {
02435     long pos;
02436     char *s, *sptr, *e;
02437     long len, slen;
02438     rb_encoding *enc;
02439 
02440     enc = rb_enc_check(str, sub);
02441     if (is_broken_string(sub)) {
02442         return -1;
02443     }
02444     len = str_strlen(str, enc);
02445     slen = str_strlen(sub, enc);
02446     if (offset < 0) {
02447         offset += len;
02448         if (offset < 0) return -1;
02449     }
          /* Not enough characters left for sub to fit. */
02450     if (len - offset < slen) return -1;
02451     s = RSTRING_PTR(str);
02452     e = s + RSTRING_LEN(str);
02453     if (offset) {
              /* From here on, offset holds the value returned by str_offset(). */
02454         offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02455         s += offset;
02456     }
02457     if (slen == 0) return offset;
02458     /* need to proceed one character at a time */
02459     sptr = RSTRING_PTR(sub);
02460     slen = RSTRING_LEN(sub);
02461     len = RSTRING_LEN(str) - offset;
02462     for (;;) {
02463         char *t;
02464         pos = rb_memsearch(sptr, slen, s, len, enc);
02465         if (pos < 0) return pos;
              /*
               * Accept the hit only if rb_enc_right_char_head() hands back
               * s+pos itself; otherwise resume the search from t.
               */
02466         t = rb_enc_right_char_head(s, s+pos, e, enc);
02467         if (t == s + pos) break;
02468         if ((len -= t - s) <= 0) return -1;
02469         offset += t - s;
02470         s = t;
02471     }
02472     return pos + offset;
02473 }
02474 
02475 
02476 /*
02477  *  call-seq:
02478  *     str.index(substring [, offset])   -> fixnum or nil
02479  *     str.index(regexp [, offset])      -> fixnum or nil
02480  *
02481  *  Returns the index of the first occurrence of the given <i>substring</i> or
02482  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02483  *  found. If the second parameter is present, it specifies the position in the
02484  *  string to begin the search.
02485  *
02486  *     "hello".index('e')             #=> 1
02487  *     "hello".index('lo')            #=> 3
02488  *     "hello".index('a')             #=> nil
02489  *     "hello".index(?e)              #=> 1
02490  *     "hello".index(/[aeiou]/, -3)   #=> 4
02491  */
02492 
02493 static VALUE
02494 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02495 {
02496     VALUE sub;
02497     VALUE initpos;
02498     long pos;
02499 
          /* One required argument (sub) and one optional (initpos, default 0). */
02500     if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02501         pos = NUM2LONG(initpos);
02502     }
02503     else {
02504         pos = 0;
02505     }
          /* A negative start counts from the end; still negative means no match. */
02506     if (pos < 0) {
02507         pos += str_strlen(str, STR_ENC_GET(str));
02508         if (pos < 0) {
02509             if (TYPE(sub) == T_REGEXP) {
                      /* For a Regexp argument the backref is reset as well. */
02510                 rb_backref_set(Qnil);
02511             }
02512             return Qnil;
02513         }
02514     }
02515 
02516     switch (TYPE(sub)) {
02517       case T_REGEXP:
02518         if (pos > str_strlen(str, STR_ENC_GET(str)))
02519             return Qnil;
              /* Convert pos with str_offset() before handing it to rb_reg_search(). */
02520         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02521                          rb_enc_check(str, sub), single_byte_optimizable(str));
02522 
02523         pos = rb_reg_search(sub, str, pos, 0);
02524         pos = rb_str_sublen(str, pos);
02525         break;
02526 
            /*
             * Anything that is neither Regexp nor String must convert via
             * rb_check_string_type(), else TypeError; the converted value
             * then takes the String path below.
             */
02527       default: {
02528         VALUE tmp;
02529 
02530         tmp = rb_check_string_type(sub);
02531         if (NIL_P(tmp)) {
02532             rb_raise(rb_eTypeError, "type mismatch: %s given",
02533                      rb_obj_classname(sub));
02534         }
02535         sub = tmp;
02536       }
02537         /* fall through */
02538       case T_STRING:
02539         pos = rb_str_index(str, sub, pos);
02540         pos = rb_str_sublen(str, pos);
02541         break;
02542     }
02543 
          /* -1 is the "not found" marker on both paths. */
02544     if (pos == -1) return Qnil;
02545     return LONG2NUM(pos);
02546 }
02547 
      /*
       * Searches +str+ backwards for +sub+, starting at character position
       * +pos+ and stepping one character towards the start per iteration.
       *
       * Returns the character position of the match, or -1 when +sub+ is a
       * broken string, when +sub+ has more characters than +str+, or when no
       * match is found.  When +str+ is empty, +pos+ is returned as adjusted.
       */
02548 static long
02549 rb_str_rindex(VALUE str, VALUE sub, long pos)
02550 {
02551     long len, slen;
02552     char *s, *sbeg, *e, *t;
02553     rb_encoding *enc;
02554     int singlebyte = single_byte_optimizable(str);
02555 
02556     enc = rb_enc_check(str, sub);
02557     if (is_broken_string(sub)) {
02558         return -1;
02559     }
02560     len = str_strlen(str, enc);
02561     slen = str_strlen(sub, enc);
02562     /* substring longer than string */
02563     if (len < slen) return -1;
          /* Pull pos back so that sub still fits between pos and the end. */
02564     if (len - pos < slen) {
02565         pos = len - slen;
02566     }
02567     if (len == 0) {
02568         return pos;
02569     }
02570     sbeg = RSTRING_PTR(str);
02571     e = RSTRING_END(str);
02572     t = RSTRING_PTR(sub);
          /* From here on slen is the byte length of sub, used by memcmp(). */
02573     slen = RSTRING_LEN(sub);
02574     s = str_nth(sbeg, e, pos, enc, singlebyte);
02575     while (s) {
02576         if (memcmp(s, t, slen) == 0) {
02577             return pos;
02578         }
02579         if (pos == 0) break;
02580         pos--;
02581         s = rb_enc_prev_char(sbeg, s, e, enc);
02582     }
02583     return -1;
02584 }
02585 
02586 
02587 /*
02588  *  call-seq:
02589  *     str.rindex(substring [, fixnum])   -> fixnum or nil
02590  *     str.rindex(regexp [, fixnum])   -> fixnum or nil
02591  *
02592  *  Returns the index of the last occurrence of the given <i>substring</i> or
02593  *  pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
02594  *  found. If the second parameter is present, it specifies the position in the
02595  *  string to end the search---characters beyond this point will not be
02596  *  considered.
02597  *
02598  *     "hello".rindex('e')             #=> 1
02599  *     "hello".rindex('l')             #=> 3
02600  *     "hello".rindex('a')             #=> nil
02601  *     "hello".rindex(?e)              #=> 1
02602  *     "hello".rindex(/[aeiou]/, -2)   #=> 1
02603  */
02604 
02605 static VALUE
02606 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02607 {
02608     VALUE sub;
02609     VALUE vpos;
02610     rb_encoding *enc = STR_ENC_GET(str);
02611     long pos, len = str_strlen(str, enc);
02612 
          /*
           * One required argument (sub) and one optional (vpos).  Without
           * vpos the search starts at len; a negative vpos counts from the
           * end, and a vpos beyond len is clamped to len.
           */
02613     if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02614         pos = NUM2LONG(vpos);
02615         if (pos < 0) {
02616             pos += len;
02617             if (pos < 0) {
02618                 if (TYPE(sub) == T_REGEXP) {
                          /* For a Regexp argument the backref is reset as well. */
02619                     rb_backref_set(Qnil);
02620                 }
02621                 return Qnil;
02622             }
02623         }
02624         if (pos > len) pos = len;
02625     }
02626     else {
02627         pos = len;
02628     }
02629 
02630     switch (TYPE(sub)) {
02631       case T_REGEXP:
02632         /* enc = rb_get_check(str, sub); */
02633         pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02634                          STR_ENC_GET(str), single_byte_optimizable(str));
02635 
              /*
               * The search is skipped when the regexp has a compiled pattern
               * (ptr) and a zero source length; pos is then returned as
               * converted above.
               */
02636         if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
                  /* Last argument 1, where rb_str_index_m passes 0. */
02637             pos = rb_reg_search(sub, str, pos, 1);
02638             pos = rb_str_sublen(str, pos);
02639         }
02640         if (pos >= 0) return LONG2NUM(pos);
02641         break;
02642 
            /*
             * Anything that is neither Regexp nor String must convert via
             * rb_check_string_type(), else TypeError; the converted value
             * then takes the String path below.
             */
02643       default: {
02644         VALUE tmp;
02645 
02646         tmp = rb_check_string_type(sub);
02647         if (NIL_P(tmp)) {
02648             rb_raise(rb_eTypeError, "type mismatch: %s given",
02649                      rb_obj_classname(sub));
02650         }
02651         sub = tmp;
02652       }
02653         /* fall through */
02654       case T_STRING:
02655         pos = rb_str_rindex(str, sub, pos);
02656         if (pos >= 0) return LONG2NUM(pos);
02657         break;
02658     }
          /* Reached only when the search produced a negative position. */
02659     return Qnil;
02660 }
02661 
02662 /*
02663  *  call-seq:
02664  *     str =~ obj   -> fixnum or nil
02665  *
02666  *  Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
02667  *  against <i>str</i>, and return the position where the match starts, or
02668  *  <code>nil</code> if there is no match. Otherwise, invokes
02669  *  <i>obj.=~</i>, passing <i>str</i> as an argument. The default
02670  *  <code>=~</code> in <code>Object</code> returns <code>nil</code>.
02671  *
02672  *     "cat o' 9 tails" =~ /\d/   #=> 7
02673  *     "cat o' 9 tails" =~ 9      #=> nil
02674  */
02675 
02676 static VALUE
02677 rb_str_match(VALUE x, VALUE y)
02678 {
          const int y_type = TYPE(y);

          /* A String on the right-hand side is rejected outright. */
          if (y_type == T_STRING) {
              rb_raise(rb_eTypeError, "type mismatch: String given");
          }
          if (y_type == T_REGEXP) {
              return rb_reg_match(y, x);
          }
          /* Any other object gets its own =~ called with the string as argument. */
          return rb_funcall(y, rb_intern("=~"), 1, x);
02689 }
02690 
02691 
      /*
       * Forward declaration: get_pat() is defined elsewhere in this file and is
       * needed by rb_str_match_m() below.
       */
02692 static VALUE get_pat(VALUE, int);
02693 
02694 
02695 /*
02696  *  call-seq:
02697  *     str.match(pattern)        -> matchdata or nil
02698  *     str.match(pattern, pos)   -> matchdata or nil
02699  *
02700  *  Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
02701  *  then invokes its <code>match</code> method on <i>str</i>.  If the second
02702  *  parameter is present, it specifies the position in the string to begin the
02703  *  search.
02704  *
02705  *     'hello'.match('(.)\1')      #=> #<MatchData "ll" 1:"l">
02706  *     'hello'.match('(.)\1')[0]   #=> "ll"
02707  *     'hello'.match(/(.)\1/)[0]   #=> "ll"
02708  *     'hello'.match('xx')         #=> nil
02709  *
02710  *  If a block is given, it is invoked with the MatchData if the match succeeds, so
02711  *  that you can write
02712  *
02713  *     str.match(pat) {|m| ...}
02714  *
02715  *  instead of
02716  *
02717  *     if m = str.match(pat)
02718  *       ...
02719  *     end
02720  *
02721  *  The return value is a value from block execution in this case.
02722  */
02723 
02724 static VALUE
02725 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02726 {
          VALUE pattern, regexp, matched;

          if (argc < 1) {
              rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
          }
          /*
           * Reuse argv as the argument list for the forwarded call: its first
           * slot is overwritten with the receiver string, and any second
           * argument is passed through unchanged.
           */
          pattern = argv[0];
          argv[0] = str;
          regexp = get_pat(pattern, 0);
          matched = rb_funcall2(regexp, rb_intern("match"), argc, argv);
          /* The block is only invoked for a non-nil result. */
          if (NIL_P(matched) || !rb_block_given_p()) {
              return matched;
          }
          return rb_yield(matched);
02737 }
02738 
02739 enum neighbor_char {
          /* returned by enc_succ_alnum_char() when no successor could be formed */
02740     NEIGHBOR_NOT_CHAR,
          /* the buffer now holds the neighbouring character */
02741     NEIGHBOR_FOUND,
          /* the search ran off the end of the byte range and wrapped around */
02742     NEIGHBOR_WRAPPED
02743 };
02744 
      /*
       * Overwrites the +len+ bytes at +p+ with the next byte sequence that
       * rb_enc_precise_mbclen() reports as a single character of exactly +len+
       * bytes in +enc+.  The bytes are stepped like a counter whose last byte
       * is the least significant.
       *
       * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
       * was 0xff (all of them have been reset to 0 by then).
       */
02745 static enum neighbor_char
02746 enc_succ_char(char *p, long len, rb_encoding *enc)
02747 {
02748     long i;
02749     int l;
02750     while (1) {
              /* Carry: trailing 0xff bytes roll over to 0; i stops at the byte to bump. */
02751         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02752             p[i] = '\0';
02753         if (i < 0)
02754             return NEIGHBOR_WRAPPED;
02755         ++((unsigned char*)p)[i];
02756         l = rb_enc_precise_mbclen(p, p+len, enc);
02757         if (MBCLEN_CHARFOUND_P(l)) {
02758             l = MBCLEN_CHARFOUND_LEN(l);
02759             if (l == len) {
02760                 return NEIGHBOR_FOUND;
02761             }
02762             else {
                      /*
                       * A shorter character was recognised: saturate the bytes
                       * after it so the next round carries into its last byte.
                       */
02763                 memset(p+l, 0xff, len-l);
02764             }
02765         }
02766         if (MBCLEN_INVALID_P(l) && i < len-1) {
                  /*
                   * Invalid sequence and the bumped byte was not the last one:
                   * shrink the prefix until it is no longer invalid (or down
                   * to zero), then saturate the bytes after index len2.
                   */
02767             long len2;
02768             int l2;
02769             for (len2 = len-1; 0 < len2; len2--) {
02770                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02771                 if (!MBCLEN_INVALID_P(l2))
02772                     break;
02773             }
02774             memset(p+len2+1, 0xff, len-(len2+1));
02775         }
02776     }
02777 }
02778 
      /*
       * Mirror image of enc_succ_char(): overwrites the +len+ bytes at +p+ with
       * the previous byte sequence that rb_enc_precise_mbclen() reports as a
       * single character of exactly +len+ bytes in +enc+.
       *
       * Returns NEIGHBOR_FOUND on success, or NEIGHBOR_WRAPPED when every byte
       * was 0 (all of them have been set to 0xff by then).
       */
02779 static enum neighbor_char
02780 enc_pred_char(char *p, long len, rb_encoding *enc)
02781 {
02782     long i;
02783     int l;
02784     while (1) {
              /* Borrow: trailing 0 bytes roll under to 0xff; i stops at the byte to drop. */
02785         for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02786             p[i] = '\xff';
02787         if (i < 0)
02788             return NEIGHBOR_WRAPPED;
02789         --((unsigned char*)p)[i];
02790         l = rb_enc_precise_mbclen(p, p+len, enc);
02791         if (MBCLEN_CHARFOUND_P(l)) {
02792             l = MBCLEN_CHARFOUND_LEN(l);
02793             if (l == len) {
02794                 return NEIGHBOR_FOUND;
02795             }
02796             else {
                      /*
                       * A shorter character was recognised: zero the bytes
                       * after it so the next round borrows from its last byte.
                       */
02797                 memset(p+l, 0, len-l);
02798             }
02799         }
02800         if (MBCLEN_INVALID_P(l) && i < len-1) {
                  /*
                   * Invalid sequence and the dropped byte was not the last one:
                   * shrink the prefix until it is no longer invalid (or down
                   * to zero), then zero the bytes after index len2.
                   */
02801             long len2;
02802             int l2;
02803             for (len2 = len-1; 0 < len2; len2--) {
02804                 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02805                 if (!MBCLEN_INVALID_P(l2))
02806                     break;
02807             }
02808             memset(p+len2+1, 0, len-(len2+1));
02809         }
02810     }
02811 }
02812 
02813 /*
02814   Overwrites +p+ with the succeeding letter in +enc+ and returns
02815   NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
02816   When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into carry.
02817   This assumes that each range is contiguous and that mbclen
02818   never changes within a range.
02819   NEIGHBOR_NOT_CHAR is returned if the character is invalid or its range has
02820   only one character.
02821  */
02822 static enum neighbor_char
02823 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02824 {
02825     enum neighbor_char ret;
02826     unsigned int c;
02827     int ctype;
02828     int range;
02829     char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02830 
          /* Classify the character: only digits and alphabetics are handled. */
02831     c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02832     if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02833         ctype = ONIGENC_CTYPE_DIGIT;
02834     else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02835         ctype = ONIGENC_CTYPE_ALPHA;
02836     else
02837         return NEIGHBOR_NOT_CHAR;
02838 
          /* Easy case: the raw successor is still of the same character type. */
02839     MEMCPY(save, p, char, len);
02840     ret = enc_succ_char(p, len, enc);
02841     if (ret == NEIGHBOR_FOUND) {
02842         c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02843         if (rb_enc_isctype(c, ctype, enc))
02844             return NEIGHBOR_FOUND;
02845     }
          /*
           * Otherwise restore the original character and walk backwards with
           * enc_pred_char() while the type still matches; p ends up on the
           * first character of the run and range counts the characters seen.
           */
02846     MEMCPY(p, save, char, len);
02847     range = 1;
02848     while (1) {
02849         MEMCPY(save, p, char, len);
02850         ret = enc_pred_char(p, len, enc);
02851         if (ret == NEIGHBOR_FOUND) {
02852             c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02853             if (!rb_enc_isctype(c, ctype, enc)) {
02854                 MEMCPY(p, save, char, len);
02855                 break;
02856             }
02857         }
02858         else {
02859             MEMCPY(p, save, char, len);
02860             break;
02861         }
02862         range++;
02863     }
          /* A run of a single character has nothing to wrap to. */
02864     if (range == 1) {
02865         return NEIGHBOR_NOT_CHAR;
02866     }
02867 
          /* Non-digits: the carry is the first character of the run itself. */
02868     if (ctype != ONIGENC_CTYPE_DIGIT) {
02869         MEMCPY(carry, p, char, len);
02870         return NEIGHBOR_WRAPPED;
02871     }
02872 
          /* Digits: the carry is the successor of the first character of the run. */
02873     MEMCPY(carry, p, char, len);
02874     enc_succ_char(carry, len, enc);
02875     return NEIGHBOR_WRAPPED;
02876 }
02877 
02878 
02879 /*
02880  *  call-seq:
02881  *     str.succ   -> new_str
02882  *     str.next   -> new_str
02883  *
02884  *  Returns the successor to <i>str</i>. The successor is calculated by
02885  *  incrementing characters starting from the rightmost alphanumeric (or
02886  *  the rightmost character if there are no alphanumerics) in the
02887  *  string. Incrementing a digit always results in another digit, and
02888  *  incrementing a letter results in another letter of the same case.
02889  *  Incrementing nonalphanumerics uses the underlying character set's
02890  *  collating sequence.
02891  *
02892  *  If the increment generates a ``carry,'' the character to the left of
02893  *  it is incremented. This process repeats until there is no carry,
02894  *  adding an additional character if necessary.
02895  *
02896  *     "abcd".succ        #=> "abce"
02897  *     "THX1138".succ     #=> "THX1139"
02898  *     "<<koala>>".succ   #=> "<<koalb>>"
02899  *     "1999zzz".succ     #=> "2000aaa"
02900  *     "ZZZ9999".succ     #=> "AAAA0000"
02901  *     "***".succ         #=> "**+"
02902  */
02903 
02904 VALUE
02905 rb_str_succ(VALUE orig)
02906 {
02907     rb_encoding *enc;
02908     VALUE str;
02909     char *sbeg, *s, *e, *last_alnum = 0;
          /* c stays -1 until some character has wrapped in the first pass */
02910     int c = -1;
02911     long l;
          /* bytes to insert if the carry runs off the left end; defaults to "\1" */
02912     char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02913     long carry_pos = 0, carry_len = 1;
02914     enum neighbor_char neighbor = NEIGHBOR_FOUND;
02915 
          /* Work on a copy; orig itself is never modified here. */
02916     str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02917     rb_enc_cr_str_copy_for_substr(str, orig);
02918     OBJ_INFECT(str, orig);
02919     if (RSTRING_LEN(str) == 0) return str;
02920 
02921     enc = STR_ENC_GET(orig);
02922     sbeg = RSTRING_PTR(str);
02923     s = e = sbeg + RSTRING_LEN(str);
02924 
          /* First pass: walk backwards, incrementing alphanumeric characters. */
02925     while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
              /*
               * After skipping a non-alphanumeric, stop carrying if the type
               * flips between letter and digit relative to the last wrapped
               * character; the carry is then inserted at last_alnum.
               */
02926         if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02927             if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02928                 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02929                 s = last_alnum;
02930                 break;
02931             }
02932         }
02933         if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02934         neighbor = enc_succ_alnum_char(s, l, enc, carry);
02935         switch (neighbor) {
02936           case NEIGHBOR_NOT_CHAR:
02937             continue;
02938           case NEIGHBOR_FOUND:
                /* incremented without a carry: done */
02939             return str;
02940           case NEIGHBOR_WRAPPED:
02941             last_alnum = s;
02942             break;
02943         }
              /* Remember where, and how wide, a carry character would be inserted. */
02944         c = 1;
02945         carry_pos = s - sbeg;
02946         carry_len = l;
02947     }
02948     if (c == -1) {              /* str contains no alnum */
              /* Second pass: increment raw characters from the right instead. */
02949         s = e;
02950         while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
                  /* note: this declaration shadows the outer neighbor */
02951             enum neighbor_char neighbor;
02952             if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02953             neighbor = enc_succ_char(s, l, enc);
02954             if (neighbor == NEIGHBOR_FOUND)
02955                 return str;
02956             if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02957                 /* wrapped to \0...\0.  search next valid char. */
02958                 enc_succ_char(s, l, enc);
02959             }
                  /* For non-ASCII-compatible encodings the wrapped character becomes the carry. */
02960             if (!rb_enc_asciicompat(enc)) {
02961                 MEMCPY(carry, s, char, l);
02962                 carry_len = l;
02963             }
02964             carry_pos = s - sbeg;
02965         }
02966     }
          /* The carry ran off the left: make room and insert it at carry_pos. */
02967     RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
          /* s is recomputed from RSTRING_PTR(str) after the resize. */
02968     s = RSTRING_PTR(str) + carry_pos;
02969     memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02970     memmove(s, carry, carry_len);
02971     STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02972     RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02973     rb_enc_str_coderange(str);
02974     return str;
02975 }
02976 
02977 
02978 /*
02979  *  call-seq:
02980  *     str.succ!   -> str
02981  *     str.next!   -> str
02982  *
02983  *  Equivalent to <code>String#succ</code>, but modifies the receiver in
02984  *  place.
02985  */
02986 
02987 static VALUE
02988 rb_str_succ_bang(VALUE str)
02989 {
          /* Build the successor as a separate string, then swap it into the receiver. */
          VALUE successor = rb_str_succ(str);

          rb_str_shared_replace(str, successor);
02992     return str;
02993 }
02994 
02995 
02996 /*
02997  *  call-seq:
02998  *     str.upto(other_str, exclusive=false) {|s| block }   -> str
02999  *     str.upto(other_str, exclusive=false)                -> an_enumerator
03000  *
03001  *  Iterates through successive values, starting at <i>str</i> and
03002  *  ending at <i>other_str</i> inclusive, passing each value in turn to
03003  *  the block. The <code>String#succ</code> method is used to generate
03004  *  each value.  If optional second argument exclusive is omitted or is false,
03005  *  the last value will be included; otherwise it will be excluded.
03006  *
03007  *  If no block is given, an enumerator is returned instead.
03008  *
03009  *     "a8".upto("b6") {|s| print s, ' ' }
03010  *     for s in "a8".."b6"
03011  *       print s, ' '
03012  *     end
03013  *
03014  *  <em>produces:</em>
03015  *
03016  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03017  *     a8 a9 b0 b1 b2 b3 b4 b5 b6
03018  *
03019  *  If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
03020  *  both are recognized as decimal numbers. In addition, the width of
03021  *  string (e.g. leading zeros) is handled appropriately.
03022  *
03023  *     "9".upto("11").to_a   #=> ["9", "10", "11"]
03024  *     "25".upto("5").to_a   #=> []
03025  *     "07".upto("11").to_a  #=> ["07", "08", "09", "10", "11"]
03026  */
03027 
03028 static VALUE
03029 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03030 {
03031     VALUE end, exclusive;
03032     VALUE current, after_end;
03033     ID succ;
03034     int n, excl, ascii;
03035     rb_encoding *enc;
03036 
03037     rb_scan_args(argc, argv, "11", &end, &exclusive);
03038     RETURN_ENUMERATOR(beg, argc, argv);
03039     excl = RTEST(exclusive);
03040     CONST_ID(succ, "succ");
03041     StringValue(end);
03042     enc = rb_enc_check(beg, end);
03043     ascii = (is_ascii_string(beg) && is_ascii_string(end));
03044     /* single character */
03045     if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03046         char c = RSTRING_PTR(beg)[0];
03047         char e = RSTRING_PTR(end)[0];
03048 
03049         if (c > e || (excl && c == e)) return beg;
03050         for (;;) {
03051             rb_yield(rb_enc_str_new(&c, 1, enc));
03052             if (!excl && c == e) break;
03053             c++;
03054             if (excl && c == e) break;
03055         }
03056         return beg;
03057     }
03058     /* both edges are all digits */
03059     if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03060         char *s, *send;
03061         VALUE b, e;
03062         int width;
03063 
03064         s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03065         width = rb_long2int(send - s);
03066         while (s < send) {
03067             if (!ISDIGIT(*s)) goto no_digits;
03068             s++;
03069         }
03070         s = RSTRING_PTR(end); send = RSTRING_END(end);
03071         while (s < send) {
03072             if (!ISDIGIT(*s)) goto no_digits;
03073             s++;
03074         }
03075         b = rb_str_to_inum(beg, 10, FALSE);
03076         e = rb_str_to_inum(end, 10, FALSE);
03077         if (FIXNUM_P(b) && FIXNUM_P(e)) {
03078             long bi = FIX2LONG(b);
03079             long ei = FIX2LONG(e);
03080             rb_encoding *usascii = rb_usascii_encoding();
03081 
03082             while (bi <= ei) {
03083                 if (excl && bi == ei) break;
03084                 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03085                 bi++;
03086             }
03087         }
03088         else {
03089             ID op = excl ? '<' : rb_intern("<=");
03090             VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03091 
03092             args[0] = INT2FIX(width);
03093             while (rb_funcall(b, op, 1, e)) {
03094                 args[1] = b;
03095                 rb_yield(rb_str_format(numberof(args), args, fmt));
03096                 b = rb_funcall(b, succ, 0, 0);
03097             }
03098         }
03099         return beg;
03100     }
03101     /* normal case */
03102   no_digits:
03103     n = rb_str_cmp(beg, end);
03104     if (n > 0 || (excl && n == 0)) return beg;
03105 
03106     after_end = rb_funcall(end, succ, 0, 0);
03107     current = rb_str_dup(beg);
03108     while (!rb_str_equal(current, after_end)) {
03109         VALUE next = Qnil;
03110         if (excl || !rb_str_equal(current, end))
03111             next = rb_funcall(current, succ, 0, 0);
03112         rb_yield(current);
03113         if (NIL_P(next)) break;
03114         current = next;
03115         StringValue(current);
03116         if (excl && rb_str_equal(current, end)) break;
03117         if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03118             break;
03119     }
03120 
03121     return beg;
03122 }
03123 
03124 static VALUE
03125 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03126 {
03127     if (rb_reg_search(re, str, 0, 0) >= 0) {
03128         VALUE match = rb_backref_get();
03129         int nth = rb_reg_backref_number(match, backref);
03130         return rb_reg_nth_match(nth, match);
03131     }
03132     return Qnil;
03133 }
03134 
03135 static VALUE
03136 rb_str_aref(VALUE str, VALUE indx)
03137 {
03138     long idx;
03139 
03140     switch (TYPE(indx)) {
03141       case T_FIXNUM:
03142         idx = FIX2LONG(indx);
03143 
03144       num_index:
03145         str = rb_str_substr(str, idx, 1);
03146         if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03147         return str;
03148 
03149       case T_REGEXP:
03150         return rb_str_subpat(str, indx, INT2FIX(0));
03151 
03152       case T_STRING:
03153         if (rb_str_index(str, indx, 0) != -1)
03154             return rb_str_dup(indx);
03155         return Qnil;
03156 
03157       default:
03158         /* check if indx is Range */
03159         {
03160             long beg, len;
03161             VALUE tmp;
03162 
03163             len = str_strlen(str, STR_ENC_GET(str));
03164             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03165               case Qfalse:
03166                 break;
03167               case Qnil:
03168                 return Qnil;
03169               default:
03170                 tmp = rb_str_substr(str, beg, len);
03171                 return tmp;
03172             }
03173         }
03174         idx = NUM2LONG(indx);
03175         goto num_index;
03176     }
03177     return Qnil;                /* not reached */
03178 }
03179 
03180 
03181 /*
03182  *  call-seq:
03183  *     str[fixnum]                 -> new_str or nil
03184  *     str[fixnum, fixnum]         -> new_str or nil
03185  *     str[range]                  -> new_str or nil
03186  *     str[regexp]                 -> new_str or nil
03187  *     str[regexp, fixnum]         -> new_str or nil
03188  *     str[other_str]              -> new_str or nil
03189  *     str.slice(fixnum)           -> new_str or nil
03190  *     str.slice(fixnum, fixnum)   -> new_str or nil
03191  *     str.slice(range)            -> new_str or nil
03192  *     str.slice(regexp)           -> new_str or nil
03193  *     str.slice(regexp, fixnum)   -> new_str or nil
03194  *     str.slice(regexp, capname)  -> new_str or nil
03195  *     str.slice(other_str)        -> new_str or nil
03196  *
03197  *  Element Reference---If passed a single <code>Fixnum</code>, returns a
03198  *  substring of one character at that position. If passed two <code>Fixnum</code>
03199  *  objects, returns a substring starting at the offset given by the first, and
03200  *  with a length given by the second. If passed a range, its beginning and end
03201  *  are interpreted as offsets delimiting the substring to be returned. In all
03202  *  three cases, if an offset is negative, it is counted from the end of <i>str</i>.
03203  *  Returns <code>nil</code> if the initial offset falls outside the string or
03204  *  the length is negative.
03205  *
03206  *  If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
03207  *  returned. If a numeric or name parameter follows the regular expression, that
03208  *  component of the <code>MatchData</code> is returned instead. If a
03209  *  <code>String</code> is given, that string is returned if it occurs in
03210  *  <i>str</i>. In both cases, <code>nil</code> is returned if there is no
03211  *  match.
03212  *
03213  *     a = "hello there"
03214  *     a[1]                   #=> "e"
03215  *     a[2, 3]                #=> "llo"
03216  *     a[2..3]                #=> "ll"
03217  *     a[-3, 2]               #=> "er"
03218  *     a[7..-2]               #=> "her"
03219  *     a[-4..-2]              #=> "her"
03220  *     a[-2..-4]              #=> ""
03221  *     a[12..-1]              #=> nil
03222  *     a[/[aeiou](.)\1/]      #=> "ell"
03223  *     a[/[aeiou](.)\1/, 0]   #=> "ell"
03224  *     a[/[aeiou](.)\1/, 1]   #=> "l"
03225  *     a[/[aeiou](.)\1/, 2]   #=> nil
03226  *     a["lo"]                #=> "lo"
03227  *     a["bye"]               #=> nil
03228  */
03229 
03230 static VALUE
03231 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03232 {
03233     if (argc == 2) {
03234         if (TYPE(argv[0]) == T_REGEXP) {
03235             return rb_str_subpat(str, argv[0], argv[1]);
03236         }
03237         return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03238     }
03239     if (argc != 1) {
03240         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03241     }
03242     return rb_str_aref(str, argv[0]);
03243 }
03244 
03245 VALUE
03246 rb_str_drop_bytes(VALUE str, long len)
03247 {
03248     char *ptr = RSTRING_PTR(str);
03249     long olen = RSTRING_LEN(str), nlen;
03250 
03251     str_modifiable(str);
03252     if (len > olen) len = olen;
03253     nlen = olen - len;
03254     if (nlen <= RSTRING_EMBED_LEN_MAX) {
03255         char *oldptr = ptr;
03256         int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03257         STR_SET_EMBED(str);
03258         STR_SET_EMBED_LEN(str, nlen);
03259         ptr = RSTRING(str)->as.ary;
03260         memmove(ptr, oldptr + len, nlen);
03261         if (fl == STR_NOEMBED) xfree(oldptr);
03262     }
03263     else {
03264         if (!STR_SHARED_P(str)) rb_str_new4(str);
03265         ptr = RSTRING(str)->as.heap.ptr += len;
03266         RSTRING(str)->as.heap.len = nlen;
03267     }
03268     ptr[nlen] = 0;
03269     ENC_CODERANGE_CLEAR(str);
03270     return str;
03271 }
03272 
03273 static void
03274 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03275 {
03276     if (beg == 0 && RSTRING_LEN(val) == 0) {
03277         rb_str_drop_bytes(str, len);
03278         OBJ_INFECT(str, val);
03279         return;
03280     }
03281 
03282     rb_str_modify(str);
03283     if (len < RSTRING_LEN(val)) {
03284         /* expand string */
03285         RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03286     }
03287 
03288     if (RSTRING_LEN(val) != len) {
03289         memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03290                 RSTRING_PTR(str) + beg + len,
03291                 RSTRING_LEN(str) - (beg + len));
03292     }
03293     if (RSTRING_LEN(val) < beg && len < 0) {
03294         MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03295     }
03296     if (RSTRING_LEN(val) > 0) {
03297         memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03298     }
03299     STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03300     if (RSTRING_PTR(str)) {
03301         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03302     }
03303     OBJ_INFECT(str, val);
03304 }
03305 
03306 static void
03307 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03308 {
03309     long slen;
03310     char *p, *e;
03311     rb_encoding *enc;
03312     int singlebyte = single_byte_optimizable(str);
03313     int cr;
03314 
03315     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03316 
03317     StringValue(val);
03318     enc = rb_enc_check(str, val);
03319     slen = str_strlen(str, enc);
03320 
03321     if (slen < beg) {
03322       out_of_range:
03323         rb_raise(rb_eIndexError, "index %ld out of string", beg);
03324     }
03325     if (beg < 0) {
03326         if (-beg > slen) {
03327             goto out_of_range;
03328         }
03329         beg += slen;
03330     }
03331     if (slen < len || slen < beg + len) {
03332         len = slen - beg;
03333     }
03334     str_modify_keep_cr(str);
03335     p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03336     if (!p) p = RSTRING_END(str);
03337     e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03338     if (!e) e = RSTRING_END(str);
03339     /* error check */
03340     beg = p - RSTRING_PTR(str); /* physical position */
03341     len = e - p;                /* physical length */
03342     rb_str_splice_0(str, beg, len, val);
03343     rb_enc_associate(str, enc);
03344     cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03345     if (cr != ENC_CODERANGE_BROKEN)
03346         ENC_CODERANGE_SET(str, cr);
03347 }
03348 
03349 void
03350 rb_str_update(VALUE str, long beg, long len, VALUE val)
03351 {
03352     rb_str_splice(str, beg, len, val);
03353 }
03354 
03355 static void
03356 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03357 {
03358     int nth;
03359     VALUE match;
03360     long start, end, len;
03361     rb_encoding *enc;
03362     struct re_registers *regs;
03363 
03364     if (rb_reg_search(re, str, 0, 0) < 0) {
03365         rb_raise(rb_eIndexError, "regexp not matched");
03366     }
03367     match = rb_backref_get();
03368     nth = rb_reg_backref_number(match, backref);
03369     regs = RMATCH_REGS(match);
03370     if (nth >= regs->num_regs) {
03371       out_of_range:
03372         rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03373     }
03374     if (nth < 0) {
03375         if (-nth >= regs->num_regs) {
03376             goto out_of_range;
03377         }
03378         nth += regs->num_regs;
03379     }
03380 
03381     start = BEG(nth);
03382     if (start == -1) {
03383         rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03384     }
03385     end = END(nth);
03386     len = end - start;
03387     StringValue(val);
03388     enc = rb_enc_check(str, val);
03389     rb_str_splice_0(str, start, len, val);
03390     rb_enc_associate(str, enc);
03391 }
03392 
03393 static VALUE
03394 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03395 {
03396     long idx, beg;
03397 
03398     switch (TYPE(indx)) {
03399       case T_FIXNUM:
03400         idx = FIX2LONG(indx);
03401       num_index:
03402         rb_str_splice(str, idx, 1, val);
03403         return val;
03404 
03405       case T_REGEXP:
03406         rb_str_subpat_set(str, indx, INT2FIX(0), val);
03407         return val;
03408 
03409       case T_STRING:
03410         beg = rb_str_index(str, indx, 0);
03411         if (beg < 0) {
03412             rb_raise(rb_eIndexError, "string not matched");
03413         }
03414         beg = rb_str_sublen(str, beg);
03415         rb_str_splice(str, beg, str_strlen(indx, 0), val);
03416         return val;
03417 
03418       default:
03419         /* check if indx is Range */
03420         {
03421             long beg, len;
03422             if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03423                 rb_str_splice(str, beg, len, val);
03424                 return val;
03425             }
03426         }
03427         idx = NUM2LONG(indx);
03428         goto num_index;
03429     }
03430 }
03431 
03432 /*
03433  *  call-seq:
03434  *     str[fixnum] = new_str
03435  *     str[fixnum, fixnum] = new_str
03436  *     str[range] = aString
03437  *     str[regexp] = new_str
03438  *     str[regexp, fixnum] = new_str
03439  *     str[regexp, name] = new_str
03440  *     str[other_str] = new_str
03441  *
03442  *  Element Assignment---Replaces some or all of the content of <i>str</i>. The
03443  *  portion of the string affected is determined using the same criteria as
03444  *  <code>String#[]</code>. If the replacement string is not the same length as
03445  *  the text it is replacing, the string will be adjusted accordingly. If the
03446  *  regular expression or string used as the index doesn't match a position
03447  *  in the string, <code>IndexError</code> is raised. If the regular expression
03448  *  form is used, the optional second <code>Fixnum</code> allows you to specify
03449  *  which portion of the match to replace (effectively using the
03450  *  <code>MatchData</code> indexing rules). The forms that take a
03451  *  <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
03452  *  out of range; the <code>Range</code> form will raise a
03453  *  <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
03454  *  forms will raise an <code>IndexError</code> when there is no match.
03455  */
03456 
03457 static VALUE
03458 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03459 {
03460     if (argc == 3) {
03461         if (TYPE(argv[0]) == T_REGEXP) {
03462             rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03463         }
03464         else {
03465             rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03466         }
03467         return argv[2];
03468     }
03469     if (argc != 2) {
03470         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03471     }
03472     return rb_str_aset(str, argv[0], argv[1]);
03473 }
03474 
03475 /*
03476  *  call-seq:
03477  *     str.insert(index, other_str)   -> str
03478  *
03479  *  Inserts <i>other_str</i> before the character at the given
03480  *  <i>index</i>, modifying <i>str</i>. Negative indices count from the
03481  *  end of the string, and insert <em>after</em> the given character.
03482  *  The intent is to insert <i>other_str</i> so that it starts at the given
03483  *  <i>index</i>.
03484  *
03485  *     "abcd".insert(0, 'X')    #=> "Xabcd"
03486  *     "abcd".insert(3, 'X')    #=> "abcXd"
03487  *     "abcd".insert(4, 'X')    #=> "abcdX"
03488  *     "abcd".insert(-3, 'X')   #=> "abXcd"
03489  *     "abcd".insert(-1, 'X')   #=> "abcdX"
03490  */
03491 
03492 static VALUE
03493 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03494 {
03495     long pos = NUM2LONG(idx);
03496 
03497     if (pos == -1) {
03498         return rb_str_append(str, str2);
03499     }
03500     else if (pos < 0) {
03501         pos++;
03502     }
03503     rb_str_splice(str, pos, 0, str2);
03504     return str;
03505 }
03506 
03507 
03508 /*
03509  *  call-seq:
03510  *     str.slice!(fixnum)           -> new_str or nil
03511  *     str.slice!(fixnum, fixnum)   -> new_str or nil
03512  *     str.slice!(range)            -> new_str or nil
03513  *     str.slice!(regexp)           -> new_str or nil
03514  *     str.slice!(other_str)        -> new_str or nil
03515  *
03516  *  Deletes the specified portion from <i>str</i>, and returns the portion
03517  *  deleted.
03518  *
03519  *     string = "this is a string"
03520  *     string.slice!(2)        #=> "i"
03521  *     string.slice!(3..6)     #=> " is "
03522  *     string.slice!(/s.*t/)   #=> "sa st"
03523  *     string.slice!("r")      #=> "r"
03524  *     string                  #=> "thing"
03525  */
03526 
03527 static VALUE
03528 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03529 {
03530     VALUE result;
03531     VALUE buf[3];
03532     int i;
03533 
03534     if (argc < 1 || 2 < argc) {
03535         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03536     }
03537     for (i=0; i<argc; i++) {
03538         buf[i] = argv[i];
03539     }
03540     str_modify_keep_cr(str);
03541     result = rb_str_aref_m(argc, buf, str);
03542     if (!NIL_P(result)) {
03543         buf[i] = rb_str_new(0,0);
03544         rb_str_aset_m(argc+1, buf, str);
03545     }
03546     return result;
03547 }
03548 
03549 static VALUE
03550 get_pat(VALUE pat, int quote)
03551 {
03552     VALUE val;
03553 
03554     switch (TYPE(pat)) {
03555       case T_REGEXP:
03556         return pat;
03557 
03558       case T_STRING:
03559         break;
03560 
03561       default:
03562         val = rb_check_string_type(pat);
03563         if (NIL_P(val)) {
03564             Check_Type(pat, T_REGEXP);
03565         }
03566         pat = val;
03567     }
03568 
03569     if (quote) {
03570         pat = rb_reg_quote(pat);
03571     }
03572 
03573     return rb_reg_regcomp(pat);
03574 }
03575 
03576 
03577 /*
03578  *  call-seq:
03579  *     str.sub!(pattern, replacement)          -> str or nil
03580  *     str.sub!(pattern) {|match| block }      -> str or nil
03581  *
03582  *  Performs the substitutions of <code>String#sub</code> in place,
03583  *  returning <i>str</i>, or <code>nil</code> if no substitutions were
03584  *  performed.
03585  */
03586 
03587 static VALUE
03588 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03589 {
03590     VALUE pat, repl, hash = Qnil;
03591     int iter = 0;
03592     int tainted = 0;
03593     int untrusted = 0;
03594     long plen;
03595 
03596     if (argc == 1 && rb_block_given_p()) {
03597         iter = 1;
03598     }
03599     else if (argc == 2) {
03600         repl = argv[1];
03601         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03602         if (NIL_P(hash)) {
03603             StringValue(repl);
03604         }
03605         if (OBJ_TAINTED(repl)) tainted = 1;
03606         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03607     }
03608     else {
03609         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03610     }
03611 
03612     pat = get_pat(argv[0], 1);
03613     str_modifiable(str);
03614     if (rb_reg_search(pat, str, 0, 0) >= 0) {
03615         rb_encoding *enc;
03616         int cr = ENC_CODERANGE(str);
03617         VALUE match = rb_backref_get();
03618         struct re_registers *regs = RMATCH_REGS(match);
03619         long beg0 = BEG(0);
03620         long end0 = END(0);
03621         char *p, *rp;
03622         long len, rlen;
03623 
03624         if (iter || !NIL_P(hash)) {
03625             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03626 
03627             if (iter) {
03628                 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03629             }
03630             else {
03631                 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03632                 repl = rb_obj_as_string(repl);
03633             }
03634             str_mod_check(str, p, len);
03635             rb_check_frozen(str);
03636         }
03637         else {
03638             repl = rb_reg_regsub(repl, str, regs, pat);
03639         }
03640         enc = rb_enc_compatible(str, repl);
03641         if (!enc) {
03642             rb_encoding *str_enc = STR_ENC_GET(str);
03643             p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03644             if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03645                 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03646                 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03647                          rb_enc_name(str_enc),
03648                          rb_enc_name(STR_ENC_GET(repl)));
03649             }
03650             enc = STR_ENC_GET(repl);
03651         }
03652         rb_str_modify(str);
03653         rb_enc_associate(str, enc);
03654         if (OBJ_TAINTED(repl)) tainted = 1;
03655         if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03656         if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03657             int cr2 = ENC_CODERANGE(repl);
03658             if (cr2 == ENC_CODERANGE_BROKEN ||
03659                 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03660                 cr = ENC_CODERANGE_UNKNOWN;
03661             else
03662                 cr = cr2;
03663         }
03664         plen = end0 - beg0;
03665         rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03666         len = RSTRING_LEN(str);
03667         if (rlen > plen) {
03668             RESIZE_CAPA(str, len + rlen - plen);
03669         }
03670         p = RSTRING_PTR(str);
03671         if (rlen != plen) {
03672             memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03673         }
03674         memcpy(p + beg0, rp, rlen);
03675         len += rlen - plen;
03676         STR_SET_LEN(str, len);
03677         RSTRING_PTR(str)[len] = '\0';
03678         ENC_CODERANGE_SET(str, cr);
03679         if (tainted) OBJ_TAINT(str);
03680         if (untrusted) OBJ_UNTRUST(str);
03681 
03682         return str;
03683     }
03684     return Qnil;
03685 }
03686 
03687 
03688 /*
03689  *  call-seq:
03690  *     str.sub(pattern, replacement)         -> new_str
03691  *     str.sub(pattern, hash)                -> new_str
03692  *     str.sub(pattern) {|match| block }     -> new_str
03693  *
03694  *  Returns a copy of <i>str</i> with the <em>first</em> occurrence of
03695  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03696  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03697  *  regular expression metacharacters it contains will be interpreted
03698  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03699  *  instead of a digit.
03700  *
03701  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03702  *  the matched text. It may contain back-references to the pattern's capture
03703  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03704  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03705  *  double-quoted string, both back-references must be preceded by an
03706  *  additional backslash. However, within <i>replacement</i> the special match
03707  *  variables, such as <code>$&</code>, will not refer to the current match.
03708  *
03709  *  If the second argument is a <code>Hash</code>, and the matched text is one
03710  *  of its keys, the corresponding value is the replacement string.
03711  *
03712  *  In the block form, the current match string is passed in as a parameter,
03713  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03714  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03715  *  returned by the block will be substituted for the match on each call.
03716  *
03717  *  The result inherits any tainting in the original string or any supplied
03718  *  replacement string.
03719  *
03720  *     "hello".sub(/[aeiou]/, '*')                  #=> "h*llo"
03721  *     "hello".sub(/([aeiou])/, '<\1>')             #=> "h<e>llo"
03722  *     "hello".sub(/./) {|s| s.ord.to_s + ' ' }     #=> "104 ello"
03723  *     "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*')  #=> "h*e*llo"
03724  *     'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
03725  *      #=> "Is /bin/bash your preferred shell?"
03726  */
03727 
03728 static VALUE
03729 rb_str_sub(int argc, VALUE *argv, VALUE str)
03730 {
03731     str = rb_str_dup(str);
03732     rb_str_sub_bang(argc, argv, str);
03733     return str;
03734 }
03735 
03736 static VALUE
03737 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03738 {
03739     VALUE pat, val, repl, match, dest, hash = Qnil;
03740     struct re_registers *regs;
03741     long beg, n;
03742     long beg0, end0;
03743     long offset, blen, slen, len, last;
03744     int iter = 0;
03745     char *sp, *cp;
03746     int tainted = 0;
03747     rb_encoding *str_enc;
03748 
03749     switch (argc) {
03750       case 1:
03751         RETURN_ENUMERATOR(str, argc, argv);
03752         iter = 1;
03753         break;
03754       case 2:
03755         repl = argv[1];
03756         hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03757         if (NIL_P(hash)) {
03758             StringValue(repl);
03759         }
03760         if (OBJ_TAINTED(repl)) tainted = 1;
03761         break;
03762       default:
03763         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03764     }
03765 
03766     pat = get_pat(argv[0], 1);
03767     beg = rb_reg_search(pat, str, 0, 0);
03768     if (beg < 0) {
03769         if (bang) return Qnil;  /* no match, no substitution */
03770         return rb_str_dup(str);
03771     }
03772 
03773     offset = 0;
03774     n = 0;
03775     blen = RSTRING_LEN(str) + 30; /* len + margin */
03776     dest = rb_str_buf_new(blen);
03777     sp = RSTRING_PTR(str);
03778     slen = RSTRING_LEN(str);
03779     cp = sp;
03780     str_enc = STR_ENC_GET(str);
03781     rb_enc_associate(dest, str_enc);
03782     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03783 
03784     do {
03785         n++;
03786         match = rb_backref_get();
03787         regs = RMATCH_REGS(match);
03788         beg0 = BEG(0);
03789         end0 = END(0);
03790         if (iter || !NIL_P(hash)) {
03791             if (iter) {
03792                 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03793             }
03794             else {
03795                 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03796                 val = rb_obj_as_string(val);
03797             }
03798             str_mod_check(str, sp, slen);
03799             if (val == dest) {  /* paranoid check [ruby-dev:24827] */
03800                 rb_raise(rb_eRuntimeError, "block should not cheat");
03801             }
03802         }
03803         else {
03804             val = rb_reg_regsub(repl, str, regs, pat);
03805         }
03806 
03807         if (OBJ_TAINTED(val)) tainted = 1;
03808 
03809         len = beg - offset;     /* copy pre-match substr */
03810         if (len) {
03811             rb_enc_str_buf_cat(dest, cp, len, str_enc);
03812         }
03813 
03814         rb_str_buf_append(dest, val);
03815 
03816         last = offset;
03817         offset = end0;
03818         if (beg0 == end0) {
03819             /*
03820              * Always consume at least one character of the input string
03821              * in order to prevent infinite loops.
03822              */
03823             if (RSTRING_LEN(str) <= end0) break;
03824             len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03825             rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03826             offset = end0 + len;
03827         }
03828         cp = RSTRING_PTR(str) + offset;
03829         if (offset > RSTRING_LEN(str)) break;
03830         beg = rb_reg_search(pat, str, offset, 0);
03831     } while (beg >= 0);
03832     if (RSTRING_LEN(str) > offset) {
03833         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03834     }
03835     rb_reg_search(pat, str, last, 0);
03836     if (bang) {
03837         rb_str_shared_replace(str, dest);
03838     }
03839     else {
03840         RBASIC(dest)->klass = rb_obj_class(str);
03841         OBJ_INFECT(dest, str);
03842         str = dest;
03843     }
03844 
03845     if (tainted) OBJ_TAINT(str);
03846     return str;
03847 }
03848 
03849 
03850 /*
03851  *  call-seq:
03852  *     str.gsub!(pattern, replacement)        -> str or nil
03853  *     str.gsub!(pattern) {|match| block }    -> str or nil
03854  *     str.gsub!(pattern)                     -> an_enumerator
03855  *
03856  *  Performs the substitutions of <code>String#gsub</code> in place, returning
03857  *  <i>str</i>, or <code>nil</code> if no substitutions were performed.
03858  *  If no block and no <i>replacement</i> is given, an enumerator is returned instead.
03859  */
03860 
03861 static VALUE
03862 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03863 {
03864     str_modify_keep_cr(str);
03865     return str_gsub(argc, argv, str, 1);
03866 }
03867 
03868 
03869 /*
03870  *  call-seq:
03871  *     str.gsub(pattern, replacement)       -> new_str
03872  *     str.gsub(pattern, hash)              -> new_str
03873  *     str.gsub(pattern) {|match| block }   -> new_str
03874  *     str.gsub(pattern)                    -> enumerator
03875  *
03876  *  Returns a copy of <i>str</i> with <em>all</em> occurrences of
03877  *  <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
03878  *  typically a <code>Regexp</code>; if given as a <code>String</code>, any
03879  *  regular expression metacharacters it contains will be interpreted
03880  *  literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
03881  *  instead of a digit.
03882  *
03883  *  If <i>replacement</i> is a <code>String</code> it will be substituted for
03884  *  the matched text. It may contain back-references to the pattern's capture
03885  *  groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
03886  *  <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
03887  *  double-quoted string, both back-references must be preceded by an
03888  *  additional backslash. However, within <i>replacement</i> the special match
03889  *  variables, such as <code>$&</code>, will not refer to the current match.
03890  *
03891  *  If the second argument is a <code>Hash</code>, and the matched text is one
03892  *  of its keys, the corresponding value is the replacement string.
03893  *
03894  *  In the block form, the current match string is passed in as a parameter,
03895  *  and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
03896  *  <code>$&</code>, and <code>$'</code> will be set appropriately. The value
03897  *  returned by the block will be substituted for the match on each call.
03898  *
03899  *  The result inherits any tainting in the original string or any supplied
03900  *  replacement string.
03901  *
03902  *  When neither a block nor a second argument is supplied, an
03903  *  <code>Enumerator</code> is returned.
03904  *
03905  *     "hello".gsub(/[aeiou]/, '*')                  #=> "h*ll*"
03906  *     "hello".gsub(/([aeiou])/, '<\1>')             #=> "h<e>ll<o>"
03907  *     "hello".gsub(/./) {|s| s.ord.to_s + ' '}      #=> "104 101 108 108 111 "
03908  *     "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}')  #=> "h{e}ll{o}"
03909  *     'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*')    #=> "h3ll*"
03910  */
03911 
03912 static VALUE
03913 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03914 {
03915     return str_gsub(argc, argv, str, 0);
03916 }
03917 
03918 
03919 /*
03920  *  call-seq:
03921  *     str.replace(other_str)   -> str
03922  *
03923  *  Replaces the contents and taintedness of <i>str</i> with the corresponding
03924  *  values in <i>other_str</i>.
03925  *
03926  *     s = "hello"         #=> "hello"
03927  *     s.replace "world"   #=> "world"
03928  */
03929 
03930 VALUE
03931 rb_str_replace(VALUE str, VALUE str2)
03932 {
03933     str_modifiable(str);
03934     if (str == str2) return str;
03935 
03936     StringValue(str2);
03937     str_discard(str);
03938     return str_replace(str, str2);
03939 }
03940 
03941 /*
03942  *  call-seq:
03943  *     string.clear    ->  string
03944  *
03945  *  Makes string empty.
03946  *
03947  *     a = "abcde"
03948  *     a.clear    #=> ""
03949  */
03950 
03951 static VALUE
03952 rb_str_clear(VALUE str)
03953 {
03954     str_discard(str);
03955     STR_SET_EMBED(str);
03956     STR_SET_EMBED_LEN(str, 0);
03957     RSTRING_PTR(str)[0] = 0;
03958     if (rb_enc_asciicompat(STR_ENC_GET(str)))
03959         ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03960     else
03961         ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03962     return str;
03963 }
03964 
03965 /*
03966  *  call-seq:
03967  *     string.chr    ->  string
03968  *
03969  *  Returns a one-character string at the beginning of the string.
03970  *
03971  *     a = "abcde"
03972  *     a.chr    #=> "a"
03973  */
03974 
03975 static VALUE
03976 rb_str_chr(VALUE str)
03977 {
03978     return rb_str_substr(str, 0, 1);
03979 }
03980 
03981 /*
03982  *  call-seq:
03983  *     str.getbyte(index)          -> 0 .. 255
03984  *
03985  *  returns the <i>index</i>th byte as an integer.
03986  */
03987 static VALUE
03988 rb_str_getbyte(VALUE str, VALUE index)
03989 {
03990     long pos = NUM2LONG(index);
03991 
03992     if (pos < 0)
03993         pos += RSTRING_LEN(str);
03994     if (pos < 0 ||  RSTRING_LEN(str) <= pos)
03995         return Qnil;
03996 
03997     return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03998 }
03999 
04000 /*
04001  *  call-seq:
04002  *     str.setbyte(index, int) -> int
04003  *
04004  *  modifies the <i>index</i>th byte as <i>int</i>.
04005  */
04006 static VALUE
04007 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04008 {
04009     long pos = NUM2LONG(index);
04010     int byte = NUM2INT(value);
04011 
04012     rb_str_modify(str);
04013 
04014     if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04015         rb_raise(rb_eIndexError, "index %ld out of string", pos);
04016     if (pos < 0)
04017         pos += RSTRING_LEN(str);
04018 
04019     RSTRING_PTR(str)[pos] = byte;
04020 
04021     return value;
04022 }
04023 
04024 static VALUE
04025 str_byte_substr(VALUE str, long beg, long len)
04026 {
04027     char *p, *s = RSTRING_PTR(str);
04028     long n = RSTRING_LEN(str);
04029     VALUE str2;
04030 
04031     if (beg > n || len < 0) return Qnil;
04032     if (beg < 0) {
04033         beg += n;
04034         if (beg < 0) return Qnil;
04035     }
04036     if (beg + len > n)
04037         len = n - beg;
04038     if (len <= 0) {
04039         len = 0;
04040         p = 0;
04041     }
04042     else
04043         p = s + beg;
04044 
04045     if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04046         str2 = rb_str_new4(str);
04047         str2 = str_new3(rb_obj_class(str2), str2);
04048         RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04049         RSTRING(str2)->as.heap.len = len;
04050     }
04051     else {
04052         str2 = rb_str_new5(str, p, len);
04053     }
04054 
04055     str_enc_copy(str2, str);
04056 
04057     if (RSTRING_LEN(str2) == 0) {
04058         if (!rb_enc_asciicompat(STR_ENC_GET(str)))
04059             ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID);
04060         else
04061             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04062     }
04063     else {
04064         switch (ENC_CODERANGE(str)) {
04065           case ENC_CODERANGE_7BIT:
04066             ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT);
04067             break;
04068           default:
04069             ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN);
04070             break;
04071         }
04072     }
04073 
04074     OBJ_INFECT(str2, str);
04075 
04076     return str2;
04077 }
04078 
04079 static VALUE
04080 str_byte_aref(VALUE str, VALUE indx)
04081 {
04082     long idx;
04083     switch (TYPE(indx)) {
04084       case T_FIXNUM:
04085         idx = FIX2LONG(indx);
04086 
04087       num_index:
04088         str = str_byte_substr(str, idx, 1);
04089         if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04090         return str;
04091 
04092       default:
04093         /* check if indx is Range */
04094         {
04095             long beg, len = RSTRING_LEN(str);
04096 
04097             switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04098               case Qfalse:
04099                 break;
04100               case Qnil:
04101                 return Qnil;
04102               default:
04103                 return str_byte_substr(str, beg, len);
04104             }
04105         }
04106         idx = NUM2LONG(indx);
04107         goto num_index;
04108     }
04109     return Qnil;                /* not reached */
04110 }
04111 
04112 /*
04113  *  call-seq:
04114  *     str.byteslice(fixnum)           -> new_str or nil
04115  *     str.byteslice(fixnum, fixnum)   -> new_str or nil
04116  *     str.byteslice(range)            -> new_str or nil
04117  *
04118  *  Byte Reference---If passed a single <code>Fixnum</code>, returns a
04119  *  substring of one byte at that position. If passed two <code>Fixnum</code>
04120  *  objects, returns a substring starting at the offset given by the first, and
04121  *  a length given by the second. If given a <code>Range</code>, a substring containing
04122  *  bytes at offsets given by the range is returned. In all three cases, if
04123  *  an offset is negative, it is counted from the end of <i>str</i>. Returns
04124  *  <code>nil</code> if the initial offset falls outside the string, the length
04125  *  is negative, or the beginning of the range is greater than the end.
04126  *  The resulting string keeps the encoding of the original string.
04127  *
04128  *     "hello".byteslice(1)     #=> "e"
04129  *     "hello".byteslice(-1)    #=> "o"
04130  *     "hello".byteslice(1, 2)  #=> "el"
04131  *     "\x80\u3042".byteslice(1, 3) #=> "\u3042"
04132  *     "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
04133  */
04134 
04135 static VALUE
04136 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04137 {
04138     if (argc == 2) {
04139         return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04140     }
04141     if (argc != 1) {
04142         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04143     }
04144     return str_byte_aref(str, argv[0]);
04145 }
04146 
04147 /*
04148  *  call-seq:
04149  *     str.reverse   -> new_str
04150  *
04151  *  Returns a new string with the characters from <i>str</i> in reverse order.
04152  *
04153  *     "stressed".reverse   #=> "desserts"
04154  */
04155 
04156 static VALUE
04157 rb_str_reverse(VALUE str)
04158 {
04159     rb_encoding *enc;
04160     VALUE rev;
04161     char *s, *e, *p;
04162     int single = 1;
04163 
04164     if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04165     enc = STR_ENC_GET(str);
04166     rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04167     s = RSTRING_PTR(str); e = RSTRING_END(str);
04168     p = RSTRING_END(rev);
04169 
04170     if (RSTRING_LEN(str) > 1) {
04171         if (single_byte_optimizable(str)) {
04172             while (s < e) {
04173                 *--p = *s++;
04174             }
04175         }
04176         else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04177             while (s < e) {
04178                 int clen = rb_enc_fast_mbclen(s, e, enc);
04179 
04180                 if (clen > 1 || (*s & 0x80)) single = 0;
04181                 p -= clen;
04182                 memcpy(p, s, clen);
04183                 s += clen;
04184             }
04185         }
04186         else {
04187             while (s < e) {
04188                 int clen = rb_enc_mbclen(s, e, enc);
04189 
04190                 if (clen > 1 || (*s & 0x80)) single = 0;
04191                 p -= clen;
04192                 memcpy(p, s, clen);
04193                 s += clen;
04194             }
04195         }
04196     }
04197     STR_SET_LEN(rev, RSTRING_LEN(str));
04198     OBJ_INFECT(rev, str);
04199     if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04200         if (single) {
04201             ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04202         }
04203         else {
04204             ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04205         }
04206     }
04207     rb_enc_cr_str_copy_for_substr(rev, str);
04208 
04209     return rev;
04210 }
04211 
04212 
04213 /*
04214  *  call-seq:
04215  *     str.reverse!   -> str
04216  *
04217  *  Reverses <i>str</i> in place.
04218  */
04219 
04220 static VALUE
04221 rb_str_reverse_bang(VALUE str)
04222 {
04223     if (RSTRING_LEN(str) > 1) {
04224         if (single_byte_optimizable(str)) {
04225             char *s, *e, c;
04226 
04227             str_modify_keep_cr(str);
04228             s = RSTRING_PTR(str);
04229             e = RSTRING_END(str) - 1;
04230             while (s < e) {
04231                 c = *s;
04232                 *s++ = *e;
04233                 *e-- = c;
04234             }
04235         }
04236         else {
04237             rb_str_shared_replace(str, rb_str_reverse(str));
04238         }
04239     }
04240     else {
04241         str_modify_keep_cr(str);
04242     }
04243     return str;
04244 }
04245 
04246 
04247 /*
04248  *  call-seq:
04249  *     str.include? other_str   -> true or false
04250  *
04251  *  Returns <code>true</code> if <i>str</i> contains the given string or
04252  *  character.
04253  *
04254  *     "hello".include? "lo"   #=> true
04255  *     "hello".include? "ol"   #=> false
04256  *     "hello".include? ?h     #=> true
04257  */
04258 
04259 static VALUE
04260 rb_str_include(VALUE str, VALUE arg)
04261 {
04262     long i;
04263 
04264     StringValue(arg);
04265     i = rb_str_index(str, arg, 0);
04266 
04267     if (i == -1) return Qfalse;
04268     return Qtrue;
04269 }
04270 
04271 
04272 /*
04273  *  call-seq:
04274  *     str.to_i(base=10)   -> integer
04275  *
04276  *  Returns the result of interpreting leading characters in <i>str</i> as an
04277  *  integer base <i>base</i> (between 2 and 36). Extraneous characters past the
04278  *  end of a valid number are ignored. If there is not a valid number at the
04279  *  start of <i>str</i>, <code>0</code> is returned. This method never raises an
04280  *  exception when <i>base</i> is valid.
04281  *
04282  *     "12345".to_i             #=> 12345
04283  *     "99 red balloons".to_i   #=> 99
04284  *     "0a".to_i                #=> 0
04285  *     "0a".to_i(16)            #=> 10
04286  *     "hello".to_i             #=> 0
04287  *     "1100101".to_i(2)        #=> 101
04288  *     "1100101".to_i(8)        #=> 294977
04289  *     "1100101".to_i(10)       #=> 1100101
04290  *     "1100101".to_i(16)       #=> 17826049
04291  */
04292 
04293 static VALUE
04294 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04295 {
04296     int base;
04297 
04298     if (argc == 0) base = 10;
04299     else {
04300         VALUE b;
04301 
04302         rb_scan_args(argc, argv, "01", &b);
04303         base = NUM2INT(b);
04304     }
04305     if (base < 0) {
04306         rb_raise(rb_eArgError, "invalid radix %d", base);
04307     }
04308     return rb_str_to_inum(str, base, FALSE);
04309 }
04310 
04311 
04312 /*
04313  *  call-seq:
04314  *     str.to_f   -> float
04315  *
04316  *  Returns the result of interpreting leading characters in <i>str</i> as a
04317  *  floating point number. Extraneous characters past the end of a valid number
04318  *  are ignored. If there is not a valid number at the start of <i>str</i>,
04319  *  <code>0.0</code> is returned. This method never raises an exception.
04320  *
04321  *     "123.45e1".to_f        #=> 1234.5
04322  *     "45.67 degrees".to_f   #=> 45.67
04323  *     "thx1138".to_f         #=> 0.0
04324  */
04325 
04326 static VALUE
04327 rb_str_to_f(VALUE str)
04328 {
04329     return DBL2NUM(rb_str_to_dbl(str, FALSE));
04330 }
04331 
04332 
04333 /*
04334  *  call-seq:
04335  *     str.to_s     -> str
04336  *     str.to_str   -> str
04337  *
04338  *  Returns the receiver.
04339  */
04340 
04341 static VALUE
04342 rb_str_to_s(VALUE str)
04343 {
04344     if (rb_obj_class(str) != rb_cString) {
04345         return str_duplicate(rb_cString, str);
04346     }
04347     return str;
04348 }
04349 
#if 0
/* Disabled by the enclosing "#if 0"; encodes the code point c in enc and
 * appends the resulting bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char bytes[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, bytes, enc);
    rb_enc_str_buf_cat(str, bytes, nbytes, enc);
}
#endif
04361 
04362 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
04363 
04364 int
04365 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04366 {
04367     char buf[CHAR_ESC_LEN + 1];
04368     int l;
04369 
04370 #if SIZEOF_INT > 4
04371     c &= 0xffffffff;
04372 #endif
04373     if (unicode_p) {
04374         if (c < 0x7F && ISPRINT(c)) {
04375             snprintf(buf, CHAR_ESC_LEN, "%c", c);
04376         }
04377         else if (c < 0x10000) {
04378             snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04379         }
04380         else {
04381             snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04382         }
04383     }
04384     else {
04385         if (c < 0x100) {
04386             snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04387         }
04388         else {
04389             snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04390         }
04391     }
04392     l = (int)strlen(buf);       /* CHAR_ESC_LEN cannot exceed INT_MAX */
04393     rb_str_buf_cat(result, buf, l);
04394     return l;
04395 }
04396 
04397 /*
04398  * call-seq:
04399  *   str.inspect   -> string
04400  *
04401  * Returns a printable version of _str_, surrounded by quote marks,
04402  * with special characters escaped.
04403  *
04404  *    str = "hello"
04405  *    str[3] = "\b"
04406  *    str.inspect       #=> "\"hel\\bo\""
04407  */
04408 
04409 VALUE
04410 rb_str_inspect(VALUE str)
04411 {
04412     rb_encoding *enc = STR_ENC_GET(str);
04413     const char *p, *pend, *prev;
04414     char buf[CHAR_ESC_LEN + 1];
04415     VALUE result = rb_str_buf_new(0);
04416     rb_encoding *resenc = rb_default_internal_encoding();
04417     int unicode_p = rb_enc_unicode_p(enc);
04418     int asciicompat = rb_enc_asciicompat(enc);
04419     static rb_encoding *utf16, *utf32;
04420 
04421     if (!utf16) utf16 = rb_enc_find("UTF-16");
04422     if (!utf32) utf32 = rb_enc_find("UTF-32");
04423     if (resenc == NULL) resenc = rb_default_external_encoding();
04424     if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04425     rb_enc_associate(result, resenc);
04426     str_buf_cat2(result, "\"");
04427 
04428     p = RSTRING_PTR(str); pend = RSTRING_END(str);
04429     prev = p;
04430     if (enc == utf16) {
04431         const unsigned char *q = (const unsigned char *)p;
04432         if (q[0] == 0xFE && q[1] == 0xFF)
04433             enc = rb_enc_find("UTF-16BE");
04434         else if (q[0] == 0xFF && q[1] == 0xFE)
04435             enc = rb_enc_find("UTF-16LE");
04436         else
04437             unicode_p = 0;
04438     }
04439     else if (enc == utf32) {
04440         const unsigned char *q = (const unsigned char *)p;
04441         if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04442             enc = rb_enc_find("UTF-32BE");
04443         else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04444             enc = rb_enc_find("UTF-32LE");
04445         else
04446             unicode_p = 0;
04447     }
04448     while (p < pend) {
04449         unsigned int c, cc;
04450         int n;
04451 
04452         n = rb_enc_precise_mbclen(p, pend, enc);
04453         if (!MBCLEN_CHARFOUND_P(n)) {
04454             if (p > prev) str_buf_cat(result, prev, p - prev);
04455             n = rb_enc_mbminlen(enc);
04456             if (pend < p + n)
04457                 n = (int)(pend - p);
04458             while (n--) {
04459                 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04460                 str_buf_cat(result, buf, strlen(buf));
04461                 prev = ++p;
04462             }
04463             continue;
04464         }
04465         n = MBCLEN_CHARFOUND_LEN(n);
04466         c = rb_enc_mbc_to_codepoint(p, pend, enc);
04467         p += n;
04468         if ((asciicompat || unicode_p) &&
04469           (c == '"'|| c == '\\' ||
04470             (c == '#' &&
04471              p < pend &&
04472              MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04473              (cc = rb_enc_codepoint(p,pend,enc),
04474               (cc == '$' || cc == '@' || cc == '{'))))) {
04475             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04476             str_buf_cat2(result, "\\");
04477             if (asciicompat || enc == resenc) {
04478                 prev = p - n;
04479                 continue;
04480             }
04481         }
04482         switch (c) {
04483           case '\n': cc = 'n'; break;
04484           case '\r': cc = 'r'; break;
04485           case '\t': cc = 't'; break;
04486           case '\f': cc = 'f'; break;
04487           case '\013': cc = 'v'; break;
04488           case '\010': cc = 'b'; break;
04489           case '\007': cc = 'a'; break;
04490           case 033: cc = 'e'; break;
04491           default: cc = 0; break;
04492         }
04493         if (cc) {
04494             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04495             buf[0] = '\\';
04496             buf[1] = (char)cc;
04497             str_buf_cat(result, buf, 2);
04498             prev = p;
04499             continue;
04500         }
04501         if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04502             (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04503             continue;
04504         }
04505         else {
04506             if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04507             rb_str_buf_cat_escaped_char(result, c, unicode_p);
04508             prev = p;
04509             continue;
04510         }
04511     }
04512     if (p > prev) str_buf_cat(result, prev, p - prev);
04513     str_buf_cat2(result, "\"");
04514 
04515     OBJ_INFECT(result, str);
04516     return result;
04517 }
04518 
04519 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04520 
04521 /*
04522  *  call-seq:
04523  *     str.dump   -> new_str
04524  *
04525  *  Produces a version of <i>str</i> with all nonprinting characters replaced by
04526  *  <code>\nnn</code> notation and all special characters escaped.
04527  */
04528 
04529 VALUE
04530 rb_str_dump(VALUE str)
04531 {
04532     rb_encoding *enc = rb_enc_get(str);
04533     long len;
04534     const char *p, *pend;
04535     char *q, *qend;
04536     VALUE result;
04537     int u8 = (enc == rb_utf8_encoding());
04538 
04539     len = 2;                    /* "" */
04540     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04541     while (p < pend) {
04542         unsigned char c = *p++;
04543         switch (c) {
04544           case '"':  case '\\':
04545           case '\n': case '\r':
04546           case '\t': case '\f':
04547           case '\013': case '\010': case '\007': case '\033':
04548             len += 2;
04549             break;
04550 
04551           case '#':
04552             len += IS_EVSTR(p, pend) ? 2 : 1;
04553             break;
04554 
04555           default:
04556             if (ISPRINT(c)) {
04557                 len++;
04558             }
04559             else {
04560                 if (u8) {       /* \u{NN} */
04561                     int n = rb_enc_precise_mbclen(p-1, pend, enc);
04562                     if (MBCLEN_CHARFOUND_P(n-1)) {
04563                         unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04564                         while (cc >>= 4) len++;
04565                         len += 5;
04566                         p += MBCLEN_CHARFOUND_LEN(n)-1;
04567                         break;
04568                     }
04569                 }
04570                 len += 4;       /* \xNN */
04571             }
04572             break;
04573         }
04574     }
04575     if (!rb_enc_asciicompat(enc)) {
04576         len += 19;              /* ".force_encoding('')" */
04577         len += strlen(enc->name);
04578     }
04579 
04580     result = rb_str_new5(str, 0, len);
04581     p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04582     q = RSTRING_PTR(result); qend = q + len + 1;
04583 
04584     *q++ = '"';
04585     while (p < pend) {
04586         unsigned char c = *p++;
04587 
04588         if (c == '"' || c == '\\') {
04589             *q++ = '\\';
04590             *q++ = c;
04591         }
04592         else if (c == '#') {
04593             if (IS_EVSTR(p, pend)) *q++ = '\\';
04594             *q++ = '#';
04595         }
04596         else if (c == '\n') {
04597             *q++ = '\\';
04598             *q++ = 'n';
04599         }
04600         else if (c == '\r') {
04601             *q++ = '\\';
04602             *q++ = 'r';
04603         }
04604         else if (c == '\t') {
04605             *q++ = '\\';
04606             *q++ = 't';
04607         }
04608         else if (c == '\f') {
04609             *q++ = '\\';
04610             *q++ = 'f';
04611         }
04612         else if (c == '\013') {
04613             *q++ = '\\';
04614             *q++ = 'v';
04615         }
04616         else if (c == '\010') {
04617             *q++ = '\\';
04618             *q++ = 'b';
04619         }
04620         else if (c == '\007') {
04621             *q++ = '\\';
04622             *q++ = 'a';
04623         }
04624         else if (c == '\033') {
04625             *q++ = '\\';
04626             *q++ = 'e';
04627         }
04628         else if (ISPRINT(c)) {
04629             *q++ = c;
04630         }
04631         else {
04632             *q++ = '\\';
04633             if (u8) {
04634                 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04635                 if (MBCLEN_CHARFOUND_P(n)) {
04636                     int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04637                     p += n;
04638                     snprintf(q, qend-q, "u{%x}", cc);
04639                     q += strlen(q);
04640                     continue;
04641                 }
04642             }
04643             snprintf(q, qend-q, "x%02X", c);
04644             q += 3;
04645         }
04646     }
04647     *q++ = '"';
04648     *q = '\0';
04649     if (!rb_enc_asciicompat(enc)) {
04650         snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04651         enc = rb_ascii8bit_encoding();
04652     }
04653     OBJ_INFECT(result, str);
04654     /* result from dump is ASCII */
04655     rb_enc_associate(result, enc);
04656     ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04657     return result;
04658 }
04659 
04660 
04661 static void
04662 rb_str_check_dummy_enc(rb_encoding *enc)
04663 {
04664     if (rb_enc_dummy_p(enc)) {
04665         rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04666                  rb_enc_name(enc));
04667     }
04668 }
04669 
04670 /*
04671  *  call-seq:
04672  *     str.upcase!   -> str or nil
04673  *
04674  *  Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
04675  *  were made.
04676  *  Note: case replacement is effective only in ASCII region.
04677  */
04678 
04679 static VALUE
04680 rb_str_upcase_bang(VALUE str)
04681 {
04682     rb_encoding *enc;
04683     char *s, *send;
04684     int modify = 0;
04685     int n;
04686 
04687     str_modify_keep_cr(str);
04688     enc = STR_ENC_GET(str);
04689     rb_str_check_dummy_enc(enc);
04690     s = RSTRING_PTR(str); send = RSTRING_END(str);
04691     if (single_byte_optimizable(str)) {
04692         while (s < send) {
04693             unsigned int c = *(unsigned char*)s;
04694 
04695             if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04696                 *s = 'A' + (c - 'a');
04697                 modify = 1;
04698             }
04699             s++;
04700         }
04701     }
04702     else {
04703         int ascompat = rb_enc_asciicompat(enc);
04704 
04705         while (s < send) {
04706             unsigned int c;
04707 
04708             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04709                 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04710                     *s = 'A' + (c - 'a');
04711                     modify = 1;
04712                 }
04713                 s++;
04714             }
04715             else {
04716                 c = rb_enc_codepoint_len(s, send, &n, enc);
04717                 if (rb_enc_islower(c, enc)) {
04718                     /* assuming toupper returns codepoint with same size */
04719                     rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04720                     modify = 1;
04721                 }
04722                 s += n;
04723             }
04724         }
04725     }
04726 
04727     if (modify) return str;
04728     return Qnil;
04729 }
04730 
04731 
04732 /*
04733  *  call-seq:
04734  *     str.upcase   -> new_str
04735  *
04736  *  Returns a copy of <i>str</i> with all lowercase letters replaced with their
04737  *  uppercase counterparts. The operation is locale insensitive---only
04738  *  characters ``a'' to ``z'' are affected.
04739  *  Note: case replacement is effective only in ASCII region.
04740  *
04741  *     "hEllO".upcase   #=> "HELLO"
04742  */
04743 
04744 static VALUE
04745 rb_str_upcase(VALUE str)
04746 {
04747     str = rb_str_dup(str);
04748     rb_str_upcase_bang(str);
04749     return str;
04750 }
04751 
04752 
04753 /*
04754  *  call-seq:
04755  *     str.downcase!   -> str or nil
04756  *
04757  *  Downcases the contents of <i>str</i>, returning <code>nil</code> if no
04758  *  changes were made.
04759  *  Note: case replacement is effective only in ASCII region.
04760  */
04761 
04762 static VALUE
04763 rb_str_downcase_bang(VALUE str)
04764 {
04765     rb_encoding *enc;
04766     char *s, *send;
04767     int modify = 0;
04768 
04769     str_modify_keep_cr(str);
04770     enc = STR_ENC_GET(str);
04771     rb_str_check_dummy_enc(enc);
04772     s = RSTRING_PTR(str); send = RSTRING_END(str);
04773     if (single_byte_optimizable(str)) {
04774         while (s < send) {
04775             unsigned int c = *(unsigned char*)s;
04776 
04777             if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04778                 *s = 'a' + (c - 'A');
04779                 modify = 1;
04780             }
04781             s++;
04782         }
04783     }
04784     else {
04785         int ascompat = rb_enc_asciicompat(enc);
04786 
04787         while (s < send) {
04788             unsigned int c;
04789             int n;
04790 
04791             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04792                 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04793                     *s = 'a' + (c - 'A');
04794                     modify = 1;
04795                 }
04796                 s++;
04797             }
04798             else {
04799                 c = rb_enc_codepoint_len(s, send, &n, enc);
04800                 if (rb_enc_isupper(c, enc)) {
04801                     /* assuming toupper returns codepoint with same size */
04802                     rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04803                     modify = 1;
04804                 }
04805                 s += n;
04806             }
04807         }
04808     }
04809 
04810     if (modify) return str;
04811     return Qnil;
04812 }
04813 
04814 
04815 /*
04816  *  call-seq:
04817  *     str.downcase   -> new_str
04818  *
04819  *  Returns a copy of <i>str</i> with all uppercase letters replaced with their
04820  *  lowercase counterparts. The operation is locale insensitive---only
04821  *  characters ``A'' to ``Z'' are affected.
04822  *  Note: case replacement is effective only in ASCII region.
04823  *
04824  *     "hEllO".downcase   #=> "hello"
04825  */
04826 
04827 static VALUE
04828 rb_str_downcase(VALUE str)
04829 {
04830     str = rb_str_dup(str);
04831     rb_str_downcase_bang(str);
04832     return str;
04833 }
04834 
04835 
04836 /*
04837  *  call-seq:
04838  *     str.capitalize!   -> str or nil
04839  *
04840  *  Modifies <i>str</i> by converting the first character to uppercase and the
04841  *  remainder to lowercase. Returns <code>nil</code> if no changes are made.
04842  *  Note: case conversion is effective only in ASCII region.
04843  *
04844  *     a = "hello"
04845  *     a.capitalize!   #=> "Hello"
04846  *     a               #=> "Hello"
04847  *     a.capitalize!   #=> nil
04848  */
04849 
04850 static VALUE
04851 rb_str_capitalize_bang(VALUE str)
04852 {
04853     rb_encoding *enc;
04854     char *s, *send;
04855     int modify = 0;
04856     unsigned int c;
04857     int n;
04858 
04859     str_modify_keep_cr(str);
04860     enc = STR_ENC_GET(str);
04861     rb_str_check_dummy_enc(enc);
04862     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04863     s = RSTRING_PTR(str); send = RSTRING_END(str);
04864 
04865     c = rb_enc_codepoint_len(s, send, &n, enc);
04866     if (rb_enc_islower(c, enc)) {
04867         rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04868         modify = 1;
04869     }
04870     s += n;
04871     while (s < send) {
04872         c = rb_enc_codepoint_len(s, send, &n, enc);
04873         if (rb_enc_isupper(c, enc)) {
04874             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04875             modify = 1;
04876         }
04877         s += n;
04878     }
04879 
04880     if (modify) return str;
04881     return Qnil;
04882 }
04883 
04884 
04885 /*
04886  *  call-seq:
04887  *     str.capitalize   -> new_str
04888  *
04889  *  Returns a copy of <i>str</i> with the first character converted to uppercase
04890  *  and the remainder to lowercase.
04891  *  Note: case conversion is effective only in ASCII region.
04892  *
04893  *     "hello".capitalize    #=> "Hello"
04894  *     "HELLO".capitalize    #=> "Hello"
04895  *     "123ABC".capitalize   #=> "123abc"
04896  */
04897 
04898 static VALUE
04899 rb_str_capitalize(VALUE str)
04900 {
04901     str = rb_str_dup(str);
04902     rb_str_capitalize_bang(str);
04903     return str;
04904 }
04905 
04906 
04907 /*
04908  *  call-seq:
04909  *     str.swapcase!   -> str or nil
04910  *
04911  *  Equivalent to <code>String#swapcase</code>, but modifies the receiver in
04912  *  place, returning <i>str</i>, or <code>nil</code> if no changes were made.
04913  *  Note: case conversion is effective only in ASCII region.
04914  */
04915 
04916 static VALUE
04917 rb_str_swapcase_bang(VALUE str)
04918 {
04919     rb_encoding *enc;
04920     char *s, *send;
04921     int modify = 0;
04922     int n;
04923 
04924     str_modify_keep_cr(str);
04925     enc = STR_ENC_GET(str);
04926     rb_str_check_dummy_enc(enc);
04927     s = RSTRING_PTR(str); send = RSTRING_END(str);
04928     while (s < send) {
04929         unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04930 
04931         if (rb_enc_isupper(c, enc)) {
04932             /* assuming toupper returns codepoint with same size */
04933             rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04934             modify = 1;
04935         }
04936         else if (rb_enc_islower(c, enc)) {
04937             /* assuming tolower returns codepoint with same size */
04938             rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04939             modify = 1;
04940         }
04941         s += n;
04942     }
04943 
04944     if (modify) return str;
04945     return Qnil;
04946 }
04947 
04948 
04949 /*
04950  *  call-seq:
04951  *     str.swapcase   -> new_str
04952  *
04953  *  Returns a copy of <i>str</i> with uppercase alphabetic characters converted
04954  *  to lowercase and lowercase characters converted to uppercase.
04955  *  Note: case conversion is effective only in ASCII region.
04956  *
04957  *     "Hello".swapcase          #=> "hELLO"
04958  *     "cYbEr_PuNk11".swapcase   #=> "CyBeR_pUnK11"
04959  */
04960 
04961 static VALUE
04962 rb_str_swapcase(VALUE str)
04963 {
04964     str = rb_str_dup(str);
04965     rb_str_swapcase_bang(str);
04966     return str;
04967 }
04968 
/* Pointer to unsigned string bytes. */
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (e.g. "a-z"), advanced
 * one code point at a time by trnext().
 */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the final byte of the specification */
};
04976 
04977 static unsigned int
04978 trnext(struct tr *t, rb_encoding *enc)
04979 {
04980     int n;
04981 
04982     for (;;) {
04983         if (!t->gen) {
04984             if (t->p == t->pend) return -1;
04985             if (t->p < t->pend - 1 && *t->p == '\\') {
04986                 t->p++;
04987             }
04988             t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04989             t->p += n;
04990             if (t->p < t->pend - 1 && *t->p == '-') {
04991                 t->p++;
04992                 if (t->p < t->pend) {
04993                     unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04994                     t->p += n;
04995                     if (t->now > c) {
04996                         if (t->now < 0x80 && c < 0x80) {
04997                             rb_raise(rb_eArgError,
04998                                      "invalid range \"%c-%c\" in string transliteration",
04999                                      t->now, c);
05000                         }
05001                         else {
05002                             rb_raise(rb_eArgError, "invalid range in string transliteration");
05003                         }
05004                         continue; /* not reached */
05005                     }
05006                     t->gen = 1;
05007                     t->max = c;
05008                 }
05009             }
05010             return t->now;
05011         }
05012         else if (++t->now < t->max) {
05013             return t->now;
05014         }
05015         else {
05016             t->gen = 0;
05017             return t->max;
05018         }
05019     }
05020 }
05021 
05022 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05023 
05024 static VALUE
05025 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05026 {
05027     const unsigned int errc = -1;
05028     unsigned int trans[256];
05029     rb_encoding *enc, *e1, *e2;
05030     struct tr trsrc, trrepl;
05031     int cflag = 0;
05032     unsigned int c, c0, last = 0;
05033     int modify = 0, i, l;
05034     char *s, *send;
05035     VALUE hash = 0;
05036     int singlebyte = single_byte_optimizable(str);
05037     int cr;
05038 
05039 #define CHECK_IF_ASCII(c) \
05040     (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05041            (cr = ENC_CODERANGE_VALID) : 0)
05042 
05043     StringValue(src);
05044     StringValue(repl);
05045     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05046     if (RSTRING_LEN(repl) == 0) {
05047         return rb_str_delete_bang(1, &src, str);
05048     }
05049 
05050     cr = ENC_CODERANGE(str);
05051     e1 = rb_enc_check(str, src);
05052     e2 = rb_enc_check(str, repl);
05053     if (e1 == e2) {
05054         enc = e1;
05055     }
05056     else {
05057         enc = rb_enc_check(src, repl);
05058     }
05059     trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05060     if (RSTRING_LEN(src) > 1 &&
05061         rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05062         trsrc.p + l < trsrc.pend) {
05063         cflag = 1;
05064         trsrc.p += l;
05065     }
05066     trrepl.p = RSTRING_PTR(repl);
05067     trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05068     trsrc.gen = trrepl.gen = 0;
05069     trsrc.now = trrepl.now = 0;
05070     trsrc.max = trrepl.max = 0;
05071 
05072     if (cflag) {
05073         for (i=0; i<256; i++) {
05074             trans[i] = 1;
05075         }
05076         while ((c = trnext(&trsrc, enc)) != errc) {
05077             if (c < 256) {
05078                 trans[c] = errc;
05079             }
05080             else {
05081                 if (!hash) hash = rb_hash_new();
05082                 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05083             }
05084         }
05085         while ((c = trnext(&trrepl, enc)) != errc)
05086             /* retrieve last replacer */;
05087         last = trrepl.now;
05088         for (i=0; i<256; i++) {
05089             if (trans[i] != errc) {
05090                 trans[i] = last;
05091             }
05092         }
05093     }
05094     else {
05095         unsigned int r;
05096 
05097         for (i=0; i<256; i++) {
05098             trans[i] = errc;
05099         }
05100         while ((c = trnext(&trsrc, enc)) != errc) {
05101             r = trnext(&trrepl, enc);
05102             if (r == errc) r = trrepl.now;
05103             if (c < 256) {
05104                 trans[c] = r;
05105                 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05106             }
05107             else {
05108                 if (!hash) hash = rb_hash_new();
05109                 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05110             }
05111         }
05112     }
05113 
05114     if (cr == ENC_CODERANGE_VALID)
05115         cr = ENC_CODERANGE_7BIT;
05116     str_modify_keep_cr(str);
05117     s = RSTRING_PTR(str); send = RSTRING_END(str);
05118     if (sflag) {
05119         int clen, tlen;
05120         long offset, max = RSTRING_LEN(str);
05121         unsigned int save = -1;
05122         char *buf = ALLOC_N(char, max), *t = buf;
05123 
05124         while (s < send) {
05125             int may_modify = 0;
05126 
05127             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05128             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05129 
05130             s += clen;
05131             if (c < 256) {
05132                 c = trans[c];
05133             }
05134             else if (hash) {
05135                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05136                 if (NIL_P(tmp)) {
05137                     if (cflag) c = last;
05138                     else c = errc;
05139                 }
05140                 else if (cflag) c = errc;
05141                 else c = NUM2INT(tmp);
05142             }
05143             else {
05144                 c = errc;
05145             }
05146             if (c != (unsigned int)-1) {
05147                 if (save == c) {
05148                     CHECK_IF_ASCII(c);
05149                     continue;
05150                 }
05151                 save = c;
05152                 tlen = rb_enc_codelen(c, enc);
05153                 modify = 1;
05154             }
05155             else {
05156                 save = -1;
05157                 c = c0;
05158                 if (enc != e1) may_modify = 1;
05159             }
05160             while (t - buf + tlen >= max) {
05161                 offset = t - buf;
05162                 max *= 2;
05163                 REALLOC_N(buf, char, max);
05164                 t = buf + offset;
05165             }
05166             rb_enc_mbcput(c, t, enc);
05167             if (may_modify && memcmp(s, t, tlen) != 0) {
05168                 modify = 1;
05169             }
05170             CHECK_IF_ASCII(c);
05171             t += tlen;
05172         }
05173         if (!STR_EMBED_P(str)) {
05174             xfree(RSTRING(str)->as.heap.ptr);
05175         }
05176         *t = '\0';
05177         RSTRING(str)->as.heap.ptr = buf;
05178         RSTRING(str)->as.heap.len = t - buf;
05179         STR_SET_NOEMBED(str);
05180         RSTRING(str)->as.heap.aux.capa = max;
05181     }
05182     else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05183         while (s < send) {
05184             c = (unsigned char)*s;
05185             if (trans[c] != errc) {
05186                 if (!cflag) {
05187                     c = trans[c];
05188                     *s = c;
05189                     modify = 1;
05190                 }
05191                 else {
05192                     *s = last;
05193                     modify = 1;
05194                 }
05195             }
05196             CHECK_IF_ASCII(c);
05197             s++;
05198         }
05199     }
05200     else {
05201         int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05202         long offset;
05203         char *buf = ALLOC_N(char, max), *t = buf;
05204 
05205         while (s < send) {
05206             int may_modify = 0;
05207             c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05208             tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05209 
05210             if (c < 256) {
05211                 c = trans[c];
05212             }
05213             else if (hash) {
05214                 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05215                 if (NIL_P(tmp)) {
05216                     if (cflag) c = last;
05217                     else c = errc;
05218                 }
05219                 else if (cflag) c = errc;
05220                 else c = NUM2INT(tmp);
05221             }
05222             else {
05223                 c = cflag ? last : errc;
05224             }
05225             if (c != errc) {
05226                 tlen = rb_enc_codelen(c, enc);
05227                 modify = 1;
05228             }
05229             else {
05230                 c = c0;
05231                 if (enc != e1) may_modify = 1;
05232             }
05233             while (t - buf + tlen >= max) {
05234                 offset = t - buf;
05235                 max *= 2;
05236                 REALLOC_N(buf, char, max);
05237                 t = buf + offset;
05238             }
05239             if (s != t) {
05240                 rb_enc_mbcput(c, t, enc);
05241                 if (may_modify && memcmp(s, t, tlen) != 0) {
05242                     modify = 1;
05243                 }
05244             }
05245             CHECK_IF_ASCII(c);
05246             s += clen;
05247             t += tlen;
05248         }
05249         if (!STR_EMBED_P(str)) {
05250             xfree(RSTRING(str)->as.heap.ptr);
05251         }
05252         *t = '\0';
05253         RSTRING(str)->as.heap.ptr = buf;
05254         RSTRING(str)->as.heap.len = t - buf;
05255         STR_SET_NOEMBED(str);
05256         RSTRING(str)->as.heap.aux.capa = max;
05257     }
05258 
05259     if (modify) {
05260         if (cr != ENC_CODERANGE_BROKEN)
05261             ENC_CODERANGE_SET(str, cr);
05262         rb_enc_associate(str, enc);
05263         return str;
05264     }
05265     return Qnil;
05266 }
05267 
05268 
05269 /*
05270  *  call-seq:
05271  *     str.tr!(from_str, to_str)   -> str or nil
05272  *
05273  *  Translates <i>str</i> in place, using the same rules as
05274  *  <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
05275  *  changes were made.
05276  */
05277 
05278 static VALUE
05279 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05280 {
05281     return tr_trans(str, src, repl, 0);
05282 }
05283 
05284 
05285 /*
05286  *  call-seq:
05287  *     str.tr(from_str, to_str)   -> new_str
05288  *
05289  *  Returns a copy of <i>str</i> with the characters in <i>from_str</i>
05290  *  replaced by the corresponding characters in <i>to_str</i>. If
05291  *  <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last
05292  *  character in order to maintain the correspondence.
05293  *
05294  *     "hello".tr('el', 'ip')      #=> "hippo"
05295  *     "hello".tr('aeiou', '*')    #=> "h*ll*"
05296  *
05297  *  Both strings may use the c1-c2 notation to denote ranges of characters,
05298  *  and <i>from_str</i> may start with a <code>^</code>, which denotes all
05299  *  characters except those listed.
05300  *
05301  *     "hello".tr('a-y', 'b-z')    #=> "ifmmp"
05302  *     "hello".tr('^aeiou', '*')   #=> "*e**o"
05303  */
05304 
05305 static VALUE
05306 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05307 {
05308     str = rb_str_dup(str);
05309     tr_trans(str, src, repl, 0);
05310     return str;
05311 }
05312 
05313 #define TR_TABLE_SIZE 257
05314 static void
05315 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05316                VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05317 {
05318     const unsigned int errc = -1;
05319     char buf[256];
05320     struct tr tr;
05321     unsigned int c;
05322     VALUE table = 0, ptable = 0;
05323     int i, l, cflag = 0;
05324 
05325     tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05326     tr.gen = tr.now = tr.max = 0;
05327 
05328     if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05329         cflag = 1;
05330         tr.p += l;
05331     }
05332     if (first) {
05333         for (i=0; i<256; i++) {
05334             stable[i] = 1;
05335         }
05336         stable[256] = cflag;
05337     }
05338     else if (stable[256] && !cflag) {
05339         stable[256] = 0;
05340     }
05341     for (i=0; i<256; i++) {
05342         buf[i] = cflag;
05343     }
05344 
05345     while ((c = trnext(&tr, enc)) != errc) {
05346         if (c < 256) {
05347             buf[c & 0xff] = !cflag;
05348         }
05349         else {
05350             VALUE key = UINT2NUM(c);
05351 
05352             if (!table) {
05353                 table = rb_hash_new();
05354                 if (cflag) {
05355                     ptable = *ctablep;
05356                     *ctablep = table;
05357                 }
05358                 else {
05359                     ptable = *tablep;
05360                     *tablep = table;
05361                 }
05362             }
05363             if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05364                 rb_hash_aset(table, key, Qtrue);
05365             }
05366         }
05367     }
05368     for (i=0; i<256; i++) {
05369         stable[i] = stable[i] && buf[i];
05370     }
05371 }
05372 
05373 
05374 static int
05375 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05376 {
05377     if (c < 256) {
05378         return table[c] != 0;
05379     }
05380     else {
05381         VALUE v = UINT2NUM(c);
05382 
05383         if (del) {
05384             if (!NIL_P(rb_hash_lookup(del, v)) &&
05385                     (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05386                 return TRUE;
05387             }
05388         }
05389         else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05390             return FALSE;
05391         }
05392         return table[256] ? TRUE : FALSE;
05393     }
05394 }
05395 
05396 /*
05397  *  call-seq:
05398  *     str.delete!([other_str]+)   -> str or nil
05399  *
05400  *  Performs a <code>delete</code> operation in place, returning <i>str</i>, or
05401  *  <code>nil</code> if <i>str</i> was not modified.
05402  */
05403 
05404 static VALUE
05405 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05406 {
05407     char squeez[TR_TABLE_SIZE];
05408     rb_encoding *enc = 0;
05409     char *s, *send, *t;
05410     VALUE del = 0, nodel = 0;
05411     int modify = 0;
05412     int i, ascompat, cr;
05413 
05414     if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05415     if (argc < 1) {
05416         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05417     }
05418     for (i=0; i<argc; i++) {
05419         VALUE s = argv[i];
05420 
05421         StringValue(s);
05422         enc = rb_enc_check(str, s);
05423         tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05424     }
05425 
05426     str_modify_keep_cr(str);
05427     ascompat = rb_enc_asciicompat(enc);
05428     s = t = RSTRING_PTR(str);
05429     send = RSTRING_END(str);
05430     cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05431     while (s < send) {
05432         unsigned int c;
05433         int clen;
05434 
05435         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05436             if (squeez[c]) {
05437                 modify = 1;
05438             }
05439             else {
05440                 if (t != s) *t = c;
05441                 t++;
05442             }
05443             s++;
05444         }
05445         else {
05446             c = rb_enc_codepoint_len(s, send, &clen, enc);
05447 
05448             if (tr_find(c, squeez, del, nodel)) {
05449                 modify = 1;
05450             }
05451             else {
05452                 if (t != s) rb_enc_mbcput(c, t, enc);
05453                 t += clen;
05454                 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05455             }
05456             s += clen;
05457         }
05458     }
05459     *t = '\0';
05460     STR_SET_LEN(str, t - RSTRING_PTR(str));
05461     ENC_CODERANGE_SET(str, cr);
05462 
05463     if (modify) return str;
05464     return Qnil;
05465 }
05466 
05467 
05468 /*
05469  *  call-seq:
05470  *     str.delete([other_str]+)   -> new_str
05471  *
05472  *  Returns a copy of <i>str</i> with all characters in the intersection of its
05473  *  arguments deleted. Uses the same rules for building the set of characters as
05474  *  <code>String#count</code>.
05475  *
05476  *     "hello".delete "l","lo"        #=> "heo"
05477  *     "hello".delete "lo"            #=> "he"
05478  *     "hello".delete "aeiou", "^e"   #=> "hell"
05479  *     "hello".delete "ej-m"          #=> "ho"
05480  */
05481 
05482 static VALUE
05483 rb_str_delete(int argc, VALUE *argv, VALUE str)
05484 {
05485     str = rb_str_dup(str);
05486     rb_str_delete_bang(argc, argv, str);
05487     return str;
05488 }
05489 
05490 
05491 /*
05492  *  call-seq:
05493  *     str.squeeze!([other_str]*)   -> str or nil
05494  *
05495  *  Squeezes <i>str</i> in place, returning either <i>str</i>, or
05496  *  <code>nil</code> if no changes were made.
05497  */
05498 
05499 static VALUE
05500 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05501 {
05502     char squeez[TR_TABLE_SIZE];
05503     rb_encoding *enc = 0;
05504     VALUE del = 0, nodel = 0;
05505     char *s, *send, *t;
05506     int i, modify = 0;
05507     int ascompat, singlebyte = single_byte_optimizable(str);
05508     unsigned int save;
05509 
05510     if (argc == 0) {
05511         enc = STR_ENC_GET(str);
05512     }
05513     else {
05514         for (i=0; i<argc; i++) {
05515             VALUE s = argv[i];
05516 
05517             StringValue(s);
05518             enc = rb_enc_check(str, s);
05519             if (singlebyte && !single_byte_optimizable(s))
05520                 singlebyte = 0;
05521             tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05522         }
05523     }
05524 
05525     str_modify_keep_cr(str);
05526     s = t = RSTRING_PTR(str);
05527     if (!s || RSTRING_LEN(str) == 0) return Qnil;
05528     send = RSTRING_END(str);
05529     save = -1;
05530     ascompat = rb_enc_asciicompat(enc);
05531 
05532     if (singlebyte) {
05533         while (s < send) {
05534             unsigned int c = *(unsigned char*)s++;
05535             if (c != save || (argc > 0 && !squeez[c])) {
05536                 *t++ = save = c;
05537             }
05538         }
05539     } else {
05540         while (s < send) {
05541             unsigned int c;
05542             int clen;
05543 
05544             if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05545                 if (c != save || (argc > 0 && !squeez[c])) {
05546                     *t++ = save = c;
05547                 }
05548                 s++;
05549             }
05550             else {
05551                 c = rb_enc_codepoint_len(s, send, &clen, enc);
05552 
05553                 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05554                     if (t != s) rb_enc_mbcput(c, t, enc);
05555                     save = c;
05556                     t += clen;
05557                 }
05558                 s += clen;
05559             }
05560         }
05561     }
05562 
05563     *t = '\0';
05564     if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05565         STR_SET_LEN(str, t - RSTRING_PTR(str));
05566         modify = 1;
05567     }
05568 
05569     if (modify) return str;
05570     return Qnil;
05571 }
05572 
05573 
05574 /*
05575  *  call-seq:
05576  *     str.squeeze([other_str]*)    -> new_str
05577  *
05578  *  Builds a set of characters from the <i>other_str</i> parameter(s) using the
05579  *  procedure described for <code>String#count</code>. Returns a new string
05580  *  where runs of the same character that occur in this set are replaced by a
05581  *  single character. If no arguments are given, all runs of identical
05582  *  characters are replaced by a single character.
05583  *
05584  *     "yellow moon".squeeze                  #=> "yelow mon"
05585  *     "  now   is  the".squeeze(" ")         #=> " now is the"
05586  *     "putters shoot balls".squeeze("m-z")   #=> "puters shot balls"
05587  */
05588 
05589 static VALUE
05590 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05591 {
05592     str = rb_str_dup(str);
05593     rb_str_squeeze_bang(argc, argv, str);
05594     return str;
05595 }
05596 
05597 
05598 /*
05599  *  call-seq:
05600  *     str.tr_s!(from_str, to_str)   -> str or nil
05601  *
05602  *  Performs <code>String#tr_s</code> processing on <i>str</i> in place,
05603  *  returning <i>str</i>, or <code>nil</code> if no changes were made.
05604  */
05605 
05606 static VALUE
05607 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05608 {
05609     return tr_trans(str, src, repl, 1);
05610 }
05611 
05612 
05613 /*
05614  *  call-seq:
05615  *     str.tr_s(from_str, to_str)   -> new_str
05616  *
05617  *  Processes a copy of <i>str</i> as described under <code>String#tr</code>,
05618  *  then removes duplicate characters in regions that were affected by the
05619  *  translation.
05620  *
05621  *     "hello".tr_s('l', 'r')     #=> "hero"
05622  *     "hello".tr_s('el', '*')    #=> "h*o"
05623  *     "hello".tr_s('el', 'hx')   #=> "hhxo"
05624  */
05625 
05626 static VALUE
05627 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05628 {
05629     str = rb_str_dup(str);
05630     tr_trans(str, src, repl, 1);
05631     return str;
05632 }
05633 
05634 
05635 /*
05636  *  call-seq:
05637  *     str.count([other_str]+)   -> fixnum
05638  *
05639  *  Each <i>other_str</i> parameter defines a set of characters to count.  The
05640  *  intersection of these sets defines the characters to count in
05641  *  <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is
05642  *  negated. The sequence c1-c2 means all characters between c1 and c2.
05643  *
05644  *     a = "hello world"
05645  *     a.count "lo"            #=> 5
05646  *     a.count "lo", "o"       #=> 2
05647  *     a.count "hello", "^l"   #=> 4
05648  *     a.count "ej-m"          #=> 4
05649  */
05650 
05651 static VALUE
05652 rb_str_count(int argc, VALUE *argv, VALUE str)
05653 {
05654     char table[TR_TABLE_SIZE];
05655     rb_encoding *enc = 0;
05656     VALUE del = 0, nodel = 0;
05657     char *s, *send;
05658     int i;
05659     int ascompat;
05660 
05661     if (argc < 1) {
05662         rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05663     }
05664     for (i=0; i<argc; i++) {
05665         VALUE tstr = argv[i];
05666         unsigned char c;
05667 
05668         StringValue(tstr);
05669         enc = rb_enc_check(str, tstr);
05670         if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05671             (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05672             int n = 0;
05673 
05674             s = RSTRING_PTR(str);
05675             if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05676             send = RSTRING_END(str);
05677             while (s < send) {
05678                 if (*(unsigned char*)s++ == c) n++;
05679             }
05680             return INT2NUM(n);
05681         }
05682         tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05683     }
05684 
05685     s = RSTRING_PTR(str);
05686     if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05687     send = RSTRING_END(str);
05688     ascompat = rb_enc_asciicompat(enc);
05689     i = 0;
05690     while (s < send) {
05691         unsigned int c;
05692 
05693         if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05694             if (table[c]) {
05695                 i++;
05696             }
05697             s++;
05698         }
05699         else {
05700             int clen;
05701             c = rb_enc_codepoint_len(s, send, &clen, enc);
05702             if (tr_find(c, table, del, nodel)) {
05703                 i++;
05704             }
05705             s += clen;
05706         }
05707     }
05708 
05709     return INT2NUM(i);
05710 }
05711 
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * characters, 0 for everything else.  Only the rows up to 0x20 are spelled
 * out; the remaining entries (0x21-0xff) are zero-initialised.
 */
static const char isspacetable[256] = {
    /* 0x00-0x0f: TAB, LF, VT, FF, CR occupy 0x09-0x0d */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20: SPACE */
    1
};

/* Non-zero when byte c is ASCII whitespace; the cast keeps the index in 0..255. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05732 
05733 /*
05734  *  call-seq:
05735  *     str.split(pattern=$;, [limit])   -> anArray
05736  *
05737  *  Divides <i>str</i> into substrings based on a delimiter, returning an array
05738  *  of these substrings.
05739  *
05740  *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
05741  *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
05742  *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
05743  *  of contiguous whitespace characters ignored.
05744  *
05745  *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
05746  *  pattern matches. Whenever the pattern matches a zero-length string,
05747  *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
05748  *  groups, the respective matches will be returned in the array as well.
05749  *
05750  *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
05751  *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
05752  *  split on whitespace as if ` ' were specified.
05753  *
05754  *  If the <i>limit</i> parameter is omitted, trailing null fields are
05755  *  suppressed. If <i>limit</i> is a positive number, at most that number of
05756  *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
05757  *  string is returned as the only entry in an array). If negative, there is no
05758  *  limit to the number of fields returned, and trailing null fields are not
05759  *  suppressed.
05760  *
05761  *     " now's  the time".split        #=> ["now's", "the", "time"]
05762  *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
05763  *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
05764  *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
05765  *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
05766  *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
05767  *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
05768  *
05769  *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
05770  *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
05771  *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
05772  *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
05773  */
05774 
05775 static VALUE
05776 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05777 { /* String#split body: choose a split mode from the separator, then cut str into fields. */
05778     rb_encoding *enc;
05779     VALUE spat;
05780     VALUE limit;
05781     enum {awk, string, regexp} split_type; /* whitespace runs / literal separator / pattern */
05782     long beg, end, i = 0; /* beg, end: byte offsets into str; i: counter compared against lim */
05783     int lim = 0;
05784     VALUE result, tmp;
05785     /* Read the optional (pattern, limit) arguments. */
05786     if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05787         lim = NUM2INT(limit);
05788         if (lim <= 0) limit = Qnil; /* non-positive limit: no cap on the field count */
05789         else if (lim == 1) { /* limit 1: the whole string is the only field */
05790             if (RSTRING_LEN(str) == 0)
05791                 return rb_ary_new2(0);
05792             return rb_ary_new3(1, str);
05793         }
05794         i = 1;
05795     }
05796     /* Decide how to split, based on the separator. */
05797     enc = STR_ENC_GET(str);
05798     if (NIL_P(spat)) {
05799         if (!NIL_P(rb_fs)) { /* no separator argument: use rb_fs when it is set */
05800             spat = rb_fs;
05801             goto fs_set;
05802         }
05803         split_type = awk;
05804     }
05805     else {
05806       fs_set:
05807         if (TYPE(spat) == T_STRING) {
05808             rb_encoding *enc2 = STR_ENC_GET(spat);
05809 
05810             split_type = string;
05811             if (RSTRING_LEN(spat) == 0) {
05812                 /* Special case - split into chars */
05813                 spat = rb_reg_regcomp(spat);
05814                 split_type = regexp;
05815             }
05816             else if (rb_enc_asciicompat(enc2) == 1) {
05817                 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){ /* a lone space selects awk mode */
05818                     split_type = awk;
05819                 }
05820             }
05821             else {
05822                 int l;
05823                 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05824                     RSTRING_LEN(spat) == l) { /* same test for encodings that are not ASCII-compatible */
05825                     split_type = awk;
05826                 }
05827             }
05828         }
05829         else {
05830             spat = get_pat(spat, 1);
05831             split_type = regexp;
05832         }
05833     }
05834     /* Build the result array according to the chosen mode. */
05835     result = rb_ary_new();
05836     beg = 0;
05837     if (split_type == awk) {
05838         char *ptr = RSTRING_PTR(str);
05839         char *eptr = RSTRING_END(str);
05840         char *bptr = ptr;
05841         int skip = 1; /* nonzero while positioned between fields */
05842         unsigned int c;
05843 
05844         end = beg;
05845         if (is_ascii_string(str)) { /* scan one byte at a time */
05846             while (ptr < eptr) {
05847                 c = (unsigned char)*ptr++;
05848                 if (skip) {
05849                     if (ascii_isspace(c)) {
05850                         beg = ptr - bptr;
05851                     }
05852                     else {
05853                         end = ptr - bptr;
05854                         skip = 0;
05855                         if (!NIL_P(limit) && lim <= i) break; /* limit reached: the remainder is pushed after the branches */
05856                     }
05857                 }
05858                 else if (ascii_isspace(c)) {
05859                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05860                     skip = 1;
05861                     beg = ptr - bptr;
05862                     if (!NIL_P(limit)) ++i;
05863                 }
05864                 else {
05865                     end = ptr - bptr;
05866                 }
05867             }
05868         }
05869         else { /* same state machine, decoding one character per step with rb_enc_codepoint_len */
05870             while (ptr < eptr) {
05871                 int n;
05872 
05873                 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05874                 ptr += n;
05875                 if (skip) {
05876                     if (rb_isspace(c)) {
05877                         beg = ptr - bptr;
05878                     }
05879                     else {
05880                         end = ptr - bptr;
05881                         skip = 0;
05882                         if (!NIL_P(limit) && lim <= i) break;
05883                     }
05884                 }
05885                 else if (rb_isspace(c)) {
05886                     rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05887                     skip = 1;
05888                     beg = ptr - bptr;
05889                     if (!NIL_P(limit)) ++i;
05890                 }
05891                 else {
05892                     end = ptr - bptr;
05893                 }
05894             }
05895         }
05896     }
05897     else if (split_type == string) {
05898         char *ptr = RSTRING_PTR(str);
05899         char *temp = ptr;
05900         char *eptr = RSTRING_END(str);
05901         char *sptr = RSTRING_PTR(spat);
05902         long slen = RSTRING_LEN(spat);
05903         /* Both the string and the separator are rejected if is_broken_string() flags them. */
05904         if (is_broken_string(str)) {
05905             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05906         }
05907         if (is_broken_string(spat)) {
05908             rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05909         }
05910         enc = rb_enc_check(str, spat);
05911         while (ptr < eptr &&
05912                (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05913             /* Check we are at the start of a char */
05914             char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05915             if (t != ptr + end) {
05916                 ptr = t;
05917                 continue;
05918             }
05919             rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05920             ptr += end + slen; /* resume right after the separator */
05921             if (!NIL_P(limit) && lim <= ++i) break;
05922         }
05923         beg = ptr - temp; /* byte offset of the part not yet split */
05924     }
05925     else {
05926         char *ptr = RSTRING_PTR(str);
05927         long len = RSTRING_LEN(str);
05928         long start = beg;
05929         long idx;
05930         int last_null = 0; /* set after an empty match at `start` caused a one-character step */
05931         struct re_registers *regs;
05932         /* An empty match at the search position first advances `start` by one character; if the retry is empty again, that character is emitted as a field. */
05933         while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05934             regs = RMATCH_REGS(rb_backref_get());
05935             if (start == end && BEG(0) == END(0)) {
05936                 if (!ptr) {
05937                     rb_ary_push(result, str_new_empty(str));
05938                     break;
05939                 }
05940                 else if (last_null == 1) {
05941                     rb_ary_push(result, rb_str_subseq(str, beg,
05942                                                       rb_enc_fast_mbclen(ptr+beg,
05943                                                                          ptr+len,
05944                                                                          enc)));
05945                     beg = start;
05946                 }
05947                 else {
05948                     if (ptr+start == ptr+len)
05949                         start++;
05950                     else
05951                         start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05952                     last_null = 1;
05953                     continue;
05954                 }
05955             }
05956             else {
05957                 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05958                 beg = start = END(0);
05959             }
05960             last_null = 0;
05961             /* Every capture group that took part in the match is appended as its own field. */
05962             for (idx=1; idx < regs->num_regs; idx++) {
05963                 if (BEG(idx) == -1) continue;
05964                 if (BEG(idx) == END(idx))
05965                     tmp = str_new_empty(str);
05966                 else
05967                     tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05968                 rb_ary_push(result, tmp);
05969             }
05970             if (!NIL_P(limit) && lim <= ++i) break;
05971         }
05972     }
05973     if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { /* push what follows the last separator */
05974         if (RSTRING_LEN(str) == beg)
05975             tmp = str_new_empty(str);
05976         else
05977             tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05978         rb_ary_push(result, tmp);
05979     }
05980     if (NIL_P(limit) && lim == 0) { /* no limit, or a limit of 0: drop trailing empty fields */
05981         long len;
05982         while ((len = RARRAY_LEN(result)) > 0 &&
05983                (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05984             rb_ary_pop(result);
05985     }
05986 
05987     return result;
05988 }
05989 
05990 VALUE
05991 rb_str_split(VALUE str, const char *sep0)
05992 {
05993     /* C-level split: wrap sep0 in a String and reuse rb_str_split_m. */
05994     VALUE pattern[1];
05995     StringValue(str);
05996     pattern[0] = rb_str_new2(sep0);
05997     return rb_str_split_m(numberof(pattern), pattern, str);
05998 }
05999 
06000 
06001 /*
06002  *  call-seq:
06003  *     str.each_line(separator=$/) {|substr| block }   -> str
06004  *     str.each_line(separator=$/)                     -> an_enumerator
06005  *
06006  *     str.lines(separator=$/) {|substr| block }       -> str
06007  *     str.lines(separator=$/)                         -> an_enumerator
06008  *
06009  *  Splits <i>str</i> using the supplied parameter as the record separator
06010  *  (<code>$/</code> by default), passing each substring in turn to the supplied
06011  *  block. If a zero-length record separator is supplied, the string is split
06012  *  into paragraphs delimited by multiple successive newlines.
06013  *
06014  *  If no block is given, an enumerator is returned instead.
06015  *
06016  *     print "Example one\n"
06017  *     "hello\nworld".each_line {|s| p s}
06018  *     print "Example two\n"
06019  *     "hello\nworld".each_line('l') {|s| p s}
06020  *     print "Example three\n"
06021  *     "hello\n\n\nworld".each_line('') {|s| p s}
06022  *
06023  *  <em>produces:</em>
06024  *
06025  *     Example one
06026  *     "hello\n"
06027  *     "world"
06028  *     Example two
06029  *     "hel"
06030  *     "l"
06031  *     "o\nworl"
06032  *     "d"
06033  *     Example three
06034  *     "hello\n\n\n"
06035  *     "world"
06036  */
06037 
06038 static VALUE
06039 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06040 { /* String#each_line body: yields each record of str, split on rs, and returns the receiver. */
06041     rb_encoding *enc;
06042     VALUE rs;
06043     unsigned int newline;
06044     const char *p, *pend, *s, *ptr; /* p: scan position; s: start of the current record; ptr: saved buffer start */
06045     long len, rslen;
06046     VALUE line;
06047     int n;
06048     VALUE orig = str;
06049     /* The separator defaults to rb_rs when no argument is passed. */
06050     if (argc == 0) {
06051         rs = rb_rs;
06052     }
06053     else {
06054         rb_scan_args(argc, argv, "01", &rs);
06055     }
06056     RETURN_ENUMERATOR(str, argc, argv);
06057     if (NIL_P(rs)) { /* nil separator: the whole string is yielded once */
06058         rb_yield(str);
06059         return orig;
06060     }
06061     str = rb_str_new4(str); /* iterate over the string returned by rb_str_new4; orig is what gets returned */
06062     ptr = p = s = RSTRING_PTR(str);
06063     pend = p + RSTRING_LEN(str);
06064     len = RSTRING_LEN(str);
06065     StringValue(rs);
06066     if (rs == rb_default_rs) { /* default separator: locate '\n' bytes with memchr */
06067         enc = rb_enc_get(str);
06068         while (p < pend) {
06069             char *p0;
06070             /* A '\n' byte only ends a record if the character containing it is a newline in enc. */
06071             p = memchr(p, '\n', pend - p);
06072             if (!p) break;
06073             p0 = rb_enc_left_char_head(s, p, pend, enc);
06074             if (!rb_enc_is_newline(p0, pend, enc)) {
06075                 p++;
06076                 continue;
06077             }
06078             p = p0 + rb_enc_mbclen(p0, pend, enc);
06079             line = rb_str_new5(str, s, p - s);
06080             OBJ_INFECT(line, str);
06081             rb_enc_cr_str_copy_for_substr(line, str);
06082             rb_yield(line);
06083             str_mod_check(str, ptr, len); /* re-checked after every yield against the saved ptr/len */
06084             s = p;
06085         }
06086         goto finish;
06087     }
06088     /* General path: look for the first character of rs, then compare the full separator bytes. */
06089     enc = rb_enc_check(str, rs);
06090     rslen = RSTRING_LEN(rs);
06091     if (rslen == 0) { /* empty separator: records end at runs of '\n' */
06092         newline = '\n';
06093     }
06094     else {
06095         newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06096     }
06097 
06098     while (p < pend) {
06099         unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06100 
06101       again:
06102         if (rslen == 0 && c == newline) { /* empty separator: a lone newline does not end a record; a run of them is consumed whole */
06103             p += n;
06104             if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06105                 goto again;
06106             }
06107             while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06108                 p += n;
06109             }
06110             p -= n;
06111         }
06112         if (c == newline &&
06113             (rslen <= 1 ||
06114              (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06115             line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); /* the yielded record includes its separator */
06116             OBJ_INFECT(line, str);
06117             rb_enc_cr_str_copy_for_substr(line, str);
06118             rb_yield(line);
06119             str_mod_check(str, ptr, len);
06120             s = p + (rslen ? rslen : n);
06121         }
06122         p += n;
06123     }
06124     /* Yield whatever follows the last separator, if anything. */
06125   finish:
06126     if (s != pend) {
06127         line = rb_str_new5(str, s, pend - s);
06128         OBJ_INFECT(line, str);
06129         rb_enc_cr_str_copy_for_substr(line, str);
06130         rb_yield(line);
06131     }
06132 
06133     return orig;
06134 }
06135 
06136 
06137 /*
06138  *  call-seq:
06139  *     str.bytes {|fixnum| block }        -> str
06140  *     str.bytes                          -> an_enumerator
06141  *
06142  *     str.each_byte {|fixnum| block }    -> str
06143  *     str.each_byte                      -> an_enumerator
06144  *
06145  *  Passes each byte in <i>str</i> to the given block, or returns
06146  *  an enumerator if no block is given.
06147  *
06148  *     "hello".each_byte {|c| print c, ' ' }
06149  *
06150  *  <em>produces:</em>
06151  *
06152  *     104 101 108 108 111
06153  */
06154 
06155 static VALUE
06156 rb_str_each_byte(VALUE str)
06157 {
06158     long pos = 0;
06159     RETURN_ENUMERATOR(str, 0, 0);
06160     while (pos < RSTRING_LEN(str)) {    /* length is re-read on every pass */
06161         rb_yield(INT2FIX(RSTRING_PTR(str)[pos] & 0xff));
06162         pos++;
06163     }
06164     return str;
06165 }
06166 
06167 
06168 /*
06169  *  call-seq:
06170  *     str.chars {|cstr| block }        -> str
06171  *     str.chars                        -> an_enumerator
06172  *
06173  *     str.each_char {|cstr| block }    -> str
06174  *     str.each_char                    -> an_enumerator
06175  *
06176  *  Passes each character in <i>str</i> to the given block, or returns
06177  *  an enumerator if no block is given.
06178  *
06179  *     "hello".each_char {|c| print c, ' ' }
06180  *
06181  *  <em>produces:</em>
06182  *
06183  *     h e l l o
06184  */
06185 
06186 static VALUE
06187 rb_str_each_char(VALUE str)
06188 {
06189     VALUE receiver = str;
06190     long off = 0, total, width;
06191     const char *base;
06192     rb_encoding *enc;
06193     int cr, fast;
06194 
06195     RETURN_ENUMERATOR(str, 0, 0);
06196     /* Walk the string returned by rb_str_new4 rather than the receiver. */
06197     str = rb_str_new4(str);
06198     base = RSTRING_PTR(str);
06199     total = RSTRING_LEN(str);
06200     enc = rb_enc_get(str);
06201     cr = ENC_CODERANGE(str);
06202     /* The code range is sampled once, before any block call. */
06203     fast = (cr == ENC_CODERANGE_VALID || cr == ENC_CODERANGE_7BIT);
06204 
06205     while (off < total) {
06206         const char *cur = base + off, *lim = base + total;
06207 
06208         width = fast ? rb_enc_fast_mbclen(cur, lim, enc)
06209                      : rb_enc_mbclen(cur, lim, enc);
06210         rb_yield(rb_str_subseq(str, off, width));
06211         off += width;
06212     }
06213     return receiver;
06214 }
06215 
06216 /*
06217  *  call-seq:
06218  *     str.codepoints {|integer| block }        -> str
06219  *     str.codepoints                           -> an_enumerator
06220  *
06221  *     str.each_codepoint {|integer| block }    -> str
06222  *     str.each_codepoint                       -> an_enumerator
06223  *
06224  *  Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
06225  *  also known as a <i>codepoint</i> when applied to Unicode strings, to the
06226  *  given block.
06227  *
06228  *  If no block is given, an enumerator is returned instead.
06229  *
06230  *     "hello\u0639".each_codepoint {|c| print c, ' ' }
06231  *
06232  *  <em>produces:</em>
06233  *
06234  *     104 101 108 108 111 1593
06235  */
06236 
06237 static VALUE
06238 rb_str_each_codepoint(VALUE str)
06239 {
06240     VALUE receiver = str;
06241     const char *cur, *stop;
06242     rb_encoding *enc;
06243 
06244     /* Hand off to the byte iterator when single_byte_optimizable(str) holds. */
06245     if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06246     RETURN_ENUMERATOR(str, 0, 0);
06247     str = rb_str_new4(str);
06248     stop = RSTRING_END(str);
06249     enc = STR_ENC_GET(str);
06250     for (cur = RSTRING_PTR(str); cur < stop; ) {
06251         int width;
06252         unsigned int cp = rb_enc_codepoint_len(cur, stop, &width, enc);
06253 
06254         rb_yield(UINT2NUM(cp));
06255         cur += width;
06256     }
06257     return receiver;
06258 }
06259 
06260 static long
06261 chopped_length(VALUE str)
06262 {
06263     /* Byte length left after dropping the last character, or a final "\r\n" pair. */
06264     rb_encoding *enc = STR_ENC_GET(str);
06265     const char *head = RSTRING_PTR(str);
06266     const char *tail = head + RSTRING_LEN(str);
06267     const char *last, *before;
06268 
06269     if (head > tail) return 0;
06270     last = rb_enc_prev_char(head, tail, tail, enc);
06271     if (!last) return 0;
06272     if (last <= head || rb_enc_ascget(last, tail, 0, enc) != '\n') return last - head;
06273     before = rb_enc_prev_char(head, last, tail, enc);
06274     if (before && rb_enc_ascget(before, tail, 0, enc) == '\r') last = before;
06275     return last - head;
06276 }
06277 
06278 /*
06279  *  call-seq:
06280  *     str.chop!   -> str or nil
06281  *
06282  *  Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
06283  *  or <code>nil</code> if <i>str</i> is the empty string.  See also
06284  *  <code>String#chomp!</code>.
06285  */
06286 
06287 static VALUE
06288 rb_str_chop_bang(VALUE str)
06289 {
06290     long kept;
06291 
06292     str_modify_keep_cr(str);
06293     /* Nothing to chop from an empty string. */
06294     if (RSTRING_LEN(str) <= 0) return Qnil;
06295 
06296     kept = chopped_length(str);
06297     STR_SET_LEN(str, kept);
06298     RSTRING_PTR(str)[kept] = '\0';
06299     /* Keep a 7BIT code range; clear any other cached value. */
06300     if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { ENC_CODERANGE_CLEAR(str); }
06301     return str;
06302 }
06303 
06304 
06305 /*
06306  *  call-seq:
06307  *     str.chop   -> new_str
06308  *
06309  *  Returns a new <code>String</code> with the last character removed.  If the
06310  *  string ends with <code>\r\n</code>, both characters are removed. Applying
06311  *  <code>chop</code> to an empty string returns an empty
06312  *  string. <code>String#chomp</code> is often a safer alternative, as it leaves
06313  *  the string unchanged if it doesn't end in a record separator.
06314  *
06315  *     "string\r\n".chop   #=> "string"
06316  *     "string\n\r".chop   #=> "string\n"
06317  *     "string\n".chop     #=> "string"
06318  *     "string".chop       #=> "strin"
06319  *     "x".chop.chop       #=> ""
06320  */
06321 
06322 static VALUE
06323 rb_str_chop(VALUE str)
06324 {   /* The result is built from the first chopped_length(str) bytes of str. */
06325     VALUE copy = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06326     OBJ_INFECT(copy, str);
06327     rb_enc_cr_str_copy_for_substr(copy, str);
06328     return copy;
06329 }
06330 
06331 
06332 /*
06333  *  call-seq:
06334  *     str.chomp!(separator=$/)   -> str or nil
06335  *
06336  *  Modifies <i>str</i> in place as described for <code>String#chomp</code>,
06337  *  returning <i>str</i>, or <code>nil</code> if no modifications were made.
06338  */
06339 
06340 static VALUE
06341 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06342 { /* String#chomp! body: removes a trailing record separator in place; returns Qnil when nothing is removed. */
06343     rb_encoding *enc;
06344     VALUE rs;
06345     int newline;
06346     char *p, *pp, *e; /* p: start of the buffer; e: end of the buffer; pp: candidate cut position */
06347     long len, rslen;
06348     /* str_modify_keep_cr() runs before the empty-string check, so it is applied to empty strings as well. */
06349     str_modify_keep_cr(str);
06350     len = RSTRING_LEN(str);
06351     if (len == 0) return Qnil;
06352     p = RSTRING_PTR(str);
06353     e = p + len;
06354     if (argc == 0) {
06355         rs = rb_rs;
06356         if (rs == rb_default_rs) {
06357           smart_chomp: /* default separator: strip one trailing "\n", "\r\n" or "\r" */
06358             enc = rb_enc_get(str);
06359             if (rb_enc_mbminlen(enc) > 1) { /* minimum character width above one byte: step back by whole characters */
06360                 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06361                 if (rb_enc_is_newline(pp, e, enc)) {
06362                     e = pp;
06363                 }
06364                 pp = e - rb_enc_mbminlen(enc);
06365                 if (pp >= p) {
06366                     pp = rb_enc_left_char_head(p, pp, e, enc);
06367                     if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06368                         e = pp;
06369                     }
06370                 }
06371                 if (e == RSTRING_END(str)) { /* e never moved: nothing to remove */
06372                     return Qnil;
06373                 }
06374                 len = e - RSTRING_PTR(str);
06375                 STR_SET_LEN(str, len);
06376             }
06377             else {
06378                 if (RSTRING_PTR(str)[len-1] == '\n') {
06379                     STR_DEC_LEN(str);
06380                     if (RSTRING_LEN(str) > 0 &&
06381                         RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06382                         STR_DEC_LEN(str);
06383                     }
06384                 }
06385                 else if (RSTRING_PTR(str)[len-1] == '\r') {
06386                     STR_DEC_LEN(str);
06387                 }
06388                 else {
06389                     return Qnil;
06390                 }
06391             }
06392             RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06393             return str;
06394         }
06395     }
06396     else {
06397         rb_scan_args(argc, argv, "01", &rs);
06398     }
06399     if (NIL_P(rs)) return Qnil; /* nil separator: nothing is ever removed */
06400     StringValue(rs);
06401     rslen = RSTRING_LEN(rs);
06402     if (rslen == 0) { /* empty separator: remove every trailing "\n" or "\r\n" */
06403         while (len>0 && p[len-1] == '\n') {
06404             len--;
06405             if (len>0 && p[len-1] == '\r')
06406                 len--;
06407         }
06408         if (len < RSTRING_LEN(str)) {
06409             STR_SET_LEN(str, len);
06410             RSTRING_PTR(str)[len] = '\0';
06411             return str;
06412         }
06413         return Qnil;
06414     }
06415     if (rslen > len) return Qnil; /* a separator longer than the string cannot match */
06416     newline = RSTRING_PTR(rs)[rslen-1];
06417     if (rslen == 1 && newline == '\n') /* an explicit "\n" is handled like the default separator */
06418         goto smart_chomp;
06419     /* Arbitrary separator: compare the tail of str with rs byte for byte. */
06420     enc = rb_enc_check(str, rs);
06421     if (is_broken_string(rs)) {
06422         return Qnil;
06423     }
06424     pp = e - rslen;
06425     if (p[len-1] == newline &&
06426         (rslen <= 1 ||
06427          memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06428         if (rb_enc_left_char_head(p, pp, e, enc) != pp) /* the match must begin on a character boundary */
06429             return Qnil;
06430         if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06431             ENC_CODERANGE_CLEAR(str);
06432         }
06433         STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06434         RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06435         return str;
06436     }
06437     return Qnil;
06438 }
06439 
06440 
06441 /*
06442  *  call-seq:
06443  *     str.chomp(separator=$/)   -> new_str
06444  *
06445  *  Returns a new <code>String</code> with the given record separator removed
06446  *  from the end of <i>str</i> (if present). If <code>$/</code> has not been
06447  *  changed from the default Ruby record separator, then <code>chomp</code> also
06448  *  removes carriage return characters (that is, it will remove <code>\n</code>,
06449  *  <code>\r</code>, and <code>\r\n</code>).
06450  *
06451  *     "hello".chomp            #=> "hello"
06452  *     "hello\n".chomp          #=> "hello"
06453  *     "hello\r\n".chomp        #=> "hello"
06454  *     "hello\n\r".chomp        #=> "hello\n"
06455  *     "hello\r".chomp          #=> "hello"
06456  *     "hello \n there".chomp   #=> "hello \n there"
06457  *     "hello".chomp("llo")     #=> "he"
06458  */
06459 
06460 static VALUE
06461 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06462 {
06463     VALUE copy = rb_str_dup(str);
06464     (void)rb_str_chomp_bang(argc, argv, copy);  /* result ignored: copy is returned even if unchanged */
06465     return copy;
06466 }
06467 
06468 /*
06469  *  call-seq:
06470  *     str.lstrip!   -> self or nil
06471  *
06472  *  Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
06473  *  change was made. See also <code>String#rstrip!</code> and
06474  *  <code>String#strip!</code>.
06475  *
06476  *     "  hello  ".lstrip!  #=> "hello  "
06477  *     "hello".lstrip!      #=> nil
06478  */
06479 
06480 static VALUE
06481 rb_str_lstrip_bang(VALUE str)
06482 {
06483     rb_encoding *enc;
06484     char *start, *stop;
06485     long rest;
06486 
06487     str_modify_keep_cr(str);
06488     enc = STR_ENC_GET(str);
06489     start = RSTRING_PTR(str);
06490     if (!start || RSTRING_LEN(str) == 0) return Qnil;
06491     stop = RSTRING_END(str);
06492     /* Advance past each leading character that rb_isspace accepts. */
06493     for (; start < stop; ) {
06494         int width;
06495         unsigned int ch = rb_enc_codepoint_len(start, stop, &width, enc);
06496         if (!rb_isspace(ch)) break;
06497         start += width;
06498     }
06499     if (start <= RSTRING_PTR(str)) return Qnil;     /* nothing was skipped */
06500 
06501     /* Slide the remaining bytes to the front and re-terminate. */
06502     rest = stop - start;
06503     STR_SET_LEN(str, rest);
06504     memmove(RSTRING_PTR(str), start, rest);
06505     RSTRING_PTR(str)[rest] = '\0';
06506     return str;
06507 }
06508 
06509 
06510 /*
06511  *  call-seq:
06512  *     str.lstrip   -> new_str
06513  *
06514  *  Returns a copy of <i>str</i> with leading whitespace removed. See also
06515  *  <code>String#rstrip</code> and <code>String#strip</code>.
06516  *
06517  *     "  hello  ".lstrip   #=> "hello  "
06518  *     "hello".lstrip       #=> "hello"
06519  */
06520 
06521 static VALUE
06522 rb_str_lstrip(VALUE str)
06523 {
06524     VALUE copy = rb_str_dup(str);
06525     (void)rb_str_lstrip_bang(copy);     /* result ignored: copy is returned even if unchanged */
06526     return copy;
06527 }
06528 
06529 
06530 /*
06531  *  call-seq:
06532  *     str.rstrip!   -> self or nil
06533  *
06534  *  Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
06535  *  no change was made. See also <code>String#lstrip!</code> and
06536  *  <code>String#strip!</code>.
06537  *
06538  *     "  hello  ".rstrip!  #=> "  hello"
06539  *     "hello".rstrip!      #=> nil
06540  */
06541 
06542 static VALUE
06543 rb_str_rstrip_bang(VALUE str)
06544 {
06545     rb_encoding *enc;
06546     char *head, *cut, *tail;
06547     long kept;
06548 
06549     str_modify_keep_cr(str);
06550     enc = STR_ENC_GET(str);
06551     rb_str_check_dummy_enc(enc);
06552     head = RSTRING_PTR(str);
06553     if (!head || RSTRING_LEN(str) == 0) return Qnil;
06554     cut = tail = RSTRING_END(str);
06555 
06556     /* Move `cut` left over trailing whitespace and NUL characters. */
06557     if (single_byte_optimizable(str)) {
06558         while (head < cut) {
06559             unsigned char byte = cut[-1];
06560             if (byte != '\0' && !ascii_isspace(byte)) break;
06561             cut--;
06562         }
06563     }
06564     else {
06565         char *prev;
06566         while ((prev = rb_enc_prev_char(head, cut, tail, enc)) != NULL) {
06567             unsigned int ch = rb_enc_codepoint(prev, tail, enc);
06568             if (ch != 0 && !rb_isspace(ch)) break;
06569             cut = prev;
06570         }
06571     }
06572     if (cut >= tail) return Qnil;                   /* nothing was trimmed */
06573     kept = cut - RSTRING_PTR(str);
06574     STR_SET_LEN(str, kept);
06575     RSTRING_PTR(str)[kept] = '\0';
06576     return str;
06577 }
06578 
06579 
06580 /*
06581  *  call-seq:
06582  *     str.rstrip   -> new_str
06583  *
06584  *  Returns a copy of <i>str</i> with trailing whitespace removed. See also
06585  *  <code>String#lstrip</code> and <code>String#strip</code>.
06586  *
06587  *     "  hello  ".rstrip   #=> "  hello"
06588  *     "hello".rstrip       #=> "hello"
06589  */
06590 
06591 static VALUE
06592 rb_str_rstrip(VALUE str)
06593 {
06594     VALUE copy = rb_str_dup(str);
06595     (void)rb_str_rstrip_bang(copy);     /* result ignored: copy is returned even if unchanged */
06596     return copy;
06597 }
06598 
06599 
06600 /*
06601  *  call-seq:
06602  *     str.strip!   -> str or nil
06603  *
06604  *  Removes leading and trailing whitespace from <i>str</i>. Returns
06605  *  <code>nil</code> if <i>str</i> was not altered.
06606  */
06607 
06608 static VALUE
06609 rb_str_strip_bang(VALUE str)
06610 {
06611     /* Head first, then tail; both calls always run. */
06612     VALUE head_result = rb_str_lstrip_bang(str);
06613     VALUE tail_result = rb_str_rstrip_bang(str);
06614     int changed = !NIL_P(head_result) || !NIL_P(tail_result);
06615     return changed ? str : Qnil;
06616 }
06617 
06618 
06619 /*
06620  *  call-seq:
06621  *     str.strip   -> new_str
06622  *
06623  *  Returns a copy of <i>str</i> with leading and trailing whitespace removed.
06624  *
06625  *     "    hello    ".strip   #=> "hello"
06626  *     "\tgoodbye\r\n".strip   #=> "goodbye"
06627  */
06628 
06629 static VALUE
06630 rb_str_strip(VALUE str)
06631 {
06632     VALUE copy = rb_str_dup(str);
06633     (void)rb_str_strip_bang(copy);      /* result ignored: copy is returned even if unchanged */
06634     return copy;
06635 }
06636 
06637 static VALUE
06638 scan_once(VALUE str, VALUE pat, long *start)
06639 {
06640     /* One step of rb_str_scan: search from *start, move *start past the
06641      * match, and return the match (or its groups); Qnil when no match. */
06642     VALUE match, groups;
06643     struct re_registers *regs;
06644     long match_end;
06645     int idx;
06646 
06647     if (rb_reg_search(pat, str, *start, 0) < 0) return Qnil;
06648 
06649     match = rb_backref_get();
06650     regs = RMATCH_REGS(match);
06651     match_end = END(0);
06652     if (BEG(0) != match_end) {
06653         *start = match_end;
06654     }
06655     else if (RSTRING_LEN(str) > match_end) {
06656         /* Empty match: always consume at least one character. */
06657         *start = match_end + rb_enc_fast_mbclen(RSTRING_PTR(str) + match_end,
06658                                                 RSTRING_END(str), STR_ENC_GET(str));
06659     }
06660     else {
06661         *start = match_end + 1;
06662     }
06663 
06664     /* Without capture groups the whole match is the result. */
06665     if (regs->num_regs == 1) return rb_reg_nth_match(0, match);
06666 
06667     groups = rb_ary_new2(regs->num_regs);
06668     for (idx = 1; idx < regs->num_regs; idx++) {
06669         rb_ary_push(groups, rb_reg_nth_match(idx, match));
06670     }
06671     return groups;
06672 }
06673 
06674 
06675 /*
06676  *  call-seq:
06677  *     str.scan(pattern)                         -> array
06678  *     str.scan(pattern) {|match, ...| block }   -> str
06679  *
06680  *  Both forms iterate through <i>str</i>, matching the pattern (which may be a
06681  *  <code>Regexp</code> or a <code>String</code>). For each match, a result is
06682  *  generated and either added to the result array or passed to the block. If
06683  *  the pattern contains no groups, each individual result consists of the
06684  *  matched string, <code>$&</code>.  If the pattern contains groups, each
06685  *  individual result is itself an array containing one entry per group.
06686  *
06687  *     a = "cruel world"
06688  *     a.scan(/\w+/)        #=> ["cruel", "world"]
06689  *     a.scan(/.../)        #=> ["cru", "el ", "wor"]
06690  *     a.scan(/(...)/)      #=> [["cru"], ["el "], ["wor"]]
06691  *     a.scan(/(..)(..)/)   #=> [["cr", "ue"], ["l ", "wo"]]
06692  *
06693  *  And the block form:
06694  *
06695  *     a.scan(/\w+/) {|w| print "<<#{w}>> " }
06696  *     print "\n"
06697  *     a.scan(/(.)(.)/) {|x,y| print y, x }
06698  *     print "\n"
06699  *
06700  *  <em>produces:</em>
06701  *
06702  *     <<cruel>> <<world>>
06703  *     rceu lowlr
06704  */
06705 
06706 static VALUE
06707 rb_str_scan(VALUE str, VALUE pat)
06708 {
06709     VALUE hit, collected = Qnil;
06710     long cursor = 0;
06711     long prev_cursor = 0, rematch_at = -1;
06712     char *orig_ptr = RSTRING_PTR(str);
06713     long orig_len = RSTRING_LEN(str);
06714     int yielding;
06715 
06716     pat = get_pat(pat, 1);
06717     yielding = rb_block_given_p();
06718     if (!yielding) collected = rb_ary_new();
06719 
06720     while (!NIL_P(hit = scan_once(str, pat, &cursor))) {
06721         /* Remember the offset from which the search for this hit started. */
06722         rematch_at = prev_cursor;
06723         prev_cursor = cursor;
06724         if (yielding) {
06725             rb_yield(hit);
06726             str_mod_check(str, orig_ptr, orig_len);
06727         }
06728         else {
06729             rb_ary_push(collected, hit);
06730         }
06731     }
06732     /* Search once more from the last hit's offset (presumably to leave its match data in place). */
06733     if (rematch_at >= 0) rb_reg_search(pat, str, rematch_at, 0);
06734     return yielding ? str : collected;
06735 }
06736 
06737 
06738 /*
06739  *  call-seq:
06740  *     str.hex   -> integer
06741  *
06742  *  Treats leading characters from <i>str</i> as a string of hexadecimal digits
06743  *  (with an optional sign and an optional <code>0x</code>) and returns the
06744  *  corresponding number. Zero is returned on error.
06745  *
06746  *     "0x0a".hex     #=> 10
06747  *     "-1234".hex    #=> -4660
06748  *     "0".hex        #=> 0
06749  *     "wombat".hex   #=> 0
06750  */
06751 
06752 static VALUE
06753 rb_str_hex(VALUE str)
06754 {
06755     rb_encoding *str_enc = rb_enc_get(str);
06756     int ascii_ok = rb_enc_asciicompat(str_enc);
06757     if (!ascii_ok) {    /* only ASCII-compatible encodings are converted */
06758         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(str_enc));
06759     }
06760     return rb_str_to_inum(str, 16, FALSE);
06761 }
06762 
06763 
06764 /*
06765  *  call-seq:
06766  *     str.oct   -> integer
06767  *
06768  *  Treats leading characters of <i>str</i> as a string of octal digits (with an
06769  *  optional sign) and returns the corresponding number.  Returns 0 if the
06770  *  conversion fails.
06771  *
06772  *     "123".oct       #=> 83
06773  *     "-377".oct      #=> -255
06774  *     "bad".oct       #=> 0
06775  *     "0377bad".oct   #=> 255
06776  */
06777 
06778 static VALUE
06779 rb_str_oct(VALUE str)
06780 {
06781     rb_encoding *enc = rb_enc_get(str);
06782 
06783     if (!rb_enc_asciicompat(enc)) {
06784         rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06785     }
06786     return rb_str_to_inum(str, -8, FALSE);
06787 }
06788 
06789 
06790 /*
06791  *  call-seq:
06792  *     str.crypt(other_str)   -> new_str
06793  *
06794  *  Applies a one-way cryptographic hash to <i>str</i> by invoking the standard
06795  *  library function <code>crypt</code>. The argument is the salt string, which
06796  *  should be two characters long, each character drawn from
06797  *  <code>[a-zA-Z0-9./]</code>.
06798  */
06799 
06800 static VALUE
06801 rb_str_crypt(VALUE str, VALUE salt)
06802 {
06803     extern char *crypt(const char *, const char *);
06804     VALUE result;
06805     const char *s, *saltp;
06806     char *res;
06807 #ifdef BROKEN_CRYPT
06808     char salt_8bit_clean[3];
06809 #endif
06810 
06811     StringValue(salt);
06812     if (RSTRING_LEN(salt) < 2)
06813         rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06814 
06815     s = RSTRING_PTR(str);
06816     if (!s) s = "";
06817     saltp = RSTRING_PTR(salt);
06818 #ifdef BROKEN_CRYPT
06819     if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06820         salt_8bit_clean[0] = saltp[0] & 0x7f;
06821         salt_8bit_clean[1] = saltp[1] & 0x7f;
06822         salt_8bit_clean[2] = '\0';
06823         saltp = salt_8bit_clean;
06824     }
06825 #endif
06826     res = crypt(s, saltp);
06827     if (!res) {
06828         rb_sys_fail("crypt");
06829     }
06830     result = rb_str_new2(res);
06831     OBJ_INFECT(result, str);
06832     OBJ_INFECT(result, salt);
06833     return result;
06834 }
06835 
06836 
06837 /*
06838  *  call-seq:
06839  *     str.intern   -> symbol
06840  *     str.to_sym   -> symbol
06841  *
06842  *  Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
06843  *  symbol if it did not previously exist. See <code>Symbol#id2name</code>.
06844  *
06845  *     "Koala".intern         #=> :Koala
06846  *     s = 'cat'.to_sym       #=> :cat
06847  *     s == :cat              #=> true
06848  *     s = '@cat'.to_sym      #=> :@cat
06849  *     s == :@cat             #=> true
06850  *
06851  *  This can also be used to create symbols that cannot be represented using the
06852  *  <code>:xxx</code> notation.
06853  *
06854  *     'cat and dog'.to_sym   #=> :"cat and dog"
06855  */
06856 
06857 VALUE
06858 rb_str_intern(VALUE s)
06859 {
06860     VALUE str = RB_GC_GUARD(s);
06861     ID id;
06862 
06863     id = rb_intern_str(str);
06864     return ID2SYM(id);
06865 }
06866 
06867 
06868 /*
06869  *  call-seq:
06870  *     str.ord   -> integer
06871  *
06872  *  Return the <code>Integer</code> ordinal of a one-character string.
06873  *
06874  *     "a".ord         #=> 97
06875  */
06876 
06877 VALUE
06878 rb_str_ord(VALUE s)
06879 {
06880     unsigned int c;
06881 
06882     c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06883     return UINT2NUM(c);
06884 }
06885 /*
06886  *  call-seq:
06887  *     str.sum(n=16)   -> integer
06888  *
06889  *  Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
06890  *  where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting
06891  *  to 16. The result is simply the sum of the binary value of each character in
06892  *  <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good
06893  *  checksum.
06894  */
06895 
06896 static VALUE
06897 rb_str_sum(int argc, VALUE *argv, VALUE str)
06898 {
06899     VALUE vbits;
06900     int bits;
06901     char *ptr, *p, *pend;
06902     long len;
06903     VALUE sum = INT2FIX(0);
06904     unsigned long sum0 = 0;
06905 
06906     if (argc == 0) {
06907         bits = 16;
06908     }
06909     else {
06910         rb_scan_args(argc, argv, "01", &vbits);
06911         bits = NUM2INT(vbits);
06912     }
06913     ptr = p = RSTRING_PTR(str);
06914     len = RSTRING_LEN(str);
06915     pend = p + len;
06916 
06917     while (p < pend) {
06918         if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06919             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06920             str_mod_check(str, ptr, len);
06921             sum0 = 0;
06922         }
06923         sum0 += (unsigned char)*p;
06924         p++;
06925     }
06926 
06927     if (bits == 0) {
06928         if (sum0) {
06929             sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06930         }
06931     }
06932     else {
06933         if (sum == INT2FIX(0)) {
06934             if (bits < (int)sizeof(long)*CHAR_BIT) {
06935                 sum0 &= (((unsigned long)1)<<bits)-1;
06936             }
06937             sum = LONG2FIX(sum0);
06938         }
06939         else {
06940             VALUE mod;
06941 
06942             if (sum0) {
06943                 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06944             }
06945 
06946             mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06947             mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06948             sum = rb_funcall(sum, '&', 1, mod);
06949         }
06950     }
06951     return sum;
06952 }
06953 
06954 static VALUE
06955 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06956 {
06957     rb_encoding *enc;
06958     VALUE w;
06959     long width, len, flen = 1, fclen = 1;
06960     VALUE res;
06961     char *p;
06962     const char *f = " ";
06963     long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06964     volatile VALUE pad;
06965     int singlebyte = 1, cr;
06966 
06967     rb_scan_args(argc, argv, "11", &w, &pad);
06968     enc = STR_ENC_GET(str);
06969     width = NUM2LONG(w);
06970     if (argc == 2) {
06971         StringValue(pad);
06972         enc = rb_enc_check(str, pad);
06973         f = RSTRING_PTR(pad);
06974         flen = RSTRING_LEN(pad);
06975         fclen = str_strlen(pad, enc);
06976         singlebyte = single_byte_optimizable(pad);
06977         if (flen == 0 || fclen == 0) {
06978             rb_raise(rb_eArgError, "zero width padding");
06979         }
06980     }
06981     len = str_strlen(str, enc);
06982     if (width < 0 || len >= width) return rb_str_dup(str);
06983     n = width - len;
06984     llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06985     rlen = n - llen;
06986     cr = ENC_CODERANGE(str);
06987     if (flen > 1) {
06988        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06989        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06990     }
06991     size = RSTRING_LEN(str);
06992     if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06993        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06994        (len += llen2 + rlen2) >= LONG_MAX - size) {
06995        rb_raise(rb_eArgError, "argument too big");
06996     }
06997     len += size;
06998     res = rb_str_new5(str, 0, len);
06999     p = RSTRING_PTR(res);
07000     if (flen <= 1) {
07001        memset(p, *f, llen);
07002        p += llen;
07003     }
07004     else {
07005        while (llen >= fclen) {
07006             memcpy(p,f,flen);
07007             p += flen;
07008             llen -= fclen;
07009         }
07010        if (llen > 0) {
07011            memcpy(p, f, llen2);
07012            p += llen2;
07013         }
07014     }
07015     memcpy(p, RSTRING_PTR(str), size);
07016     p += size;
07017     if (flen <= 1) {
07018        memset(p, *f, rlen);
07019        p += rlen;
07020     }
07021     else {
07022        while (rlen >= fclen) {
07023             memcpy(p,f,flen);
07024             p += flen;
07025             rlen -= fclen;
07026         }
07027        if (rlen > 0) {
07028            memcpy(p, f, rlen2);
07029            p += rlen2;
07030         }
07031     }
07032     *p = '\0';
07033     STR_SET_LEN(res, p-RSTRING_PTR(res));
07034     OBJ_INFECT(res, str);
07035     if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07036     rb_enc_associate(res, enc);
07037     if (argc == 2)
07038         cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07039     if (cr != ENC_CODERANGE_BROKEN)
07040         ENC_CODERANGE_SET(res, cr);
07041     return res;
07042 }
07043 
07044 
07045 /*
07046  *  call-seq:
07047  *     str.ljust(integer, padstr=' ')   -> new_str
07048  *
07049  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07050  *  <code>String</code> of length <i>integer</i> with <i>str</i> left justified
07051  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07052  *
07053  *     "hello".ljust(4)            #=> "hello"
07054  *     "hello".ljust(20)           #=> "hello               "
07055  *     "hello".ljust(20, '1234')   #=> "hello123412341234123"
07056  */
07057 
07058 static VALUE
07059 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07060 {
07061     return rb_str_justify(argc, argv, str, 'l');
07062 }
07063 
07064 
07065 /*
07066  *  call-seq:
07067  *     str.rjust(integer, padstr=' ')   -> new_str
07068  *
07069  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07070  *  <code>String</code> of length <i>integer</i> with <i>str</i> right justified
07071  *  and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07072  *
07073  *     "hello".rjust(4)            #=> "hello"
07074  *     "hello".rjust(20)           #=> "               hello"
07075  *     "hello".rjust(20, '1234')   #=> "123412341234123hello"
07076  */
07077 
07078 static VALUE
07079 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07080 {
07081     return rb_str_justify(argc, argv, str, 'r');
07082 }
07083 
07084 
07085 /*
07086  *  call-seq:
07087  *     str.center(integer, padstr=' ')   -> new_str
07088  *
07089  *  If <i>integer</i> is greater than the length of <i>str</i>, returns a new
07090  *  <code>String</code> of length <i>integer</i> with <i>str</i> centered and
07091  *  padded with <i>padstr</i>; otherwise, returns <i>str</i>.
07092  *
07093  *     "hello".center(4)         #=> "hello"
07094  *     "hello".center(20)        #=> "       hello        "
07095  *     "hello".center(20, '123') #=> "1231231hello12312312"
07096  */
07097 
07098 static VALUE
07099 rb_str_center(int argc, VALUE *argv, VALUE str)
07100 {
07101     return rb_str_justify(argc, argv, str, 'c');
07102 }
07103 
07104 /*
07105  *  call-seq:
07106  *     str.partition(sep)              -> [head, sep, tail]
07107  *     str.partition(regexp)           -> [head, match, tail]
07108  *
07109  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
07110  *  and returns the part before it, the match, and the part
07111  *  after it.
07112  *  If it is not found, returns <i>str</i> and two empty strings.
07113  *
07114  *     "hello".partition("l")         #=> ["he", "l", "lo"]
07115  *     "hello".partition("x")         #=> ["hello", "", ""]
07116  *     "hello".partition(/.l/)        #=> ["h", "el", "lo"]
07117  */
07118 
07119 static VALUE
07120 rb_str_partition(VALUE str, VALUE sep)
07121 {
07122     long pos;
07123     int regex = FALSE;
07124 
07125     if (TYPE(sep) == T_REGEXP) {
07126         pos = rb_reg_search(sep, str, 0, 0);
07127         regex = TRUE;
07128     }
07129     else {
07130         VALUE tmp;
07131 
07132         tmp = rb_check_string_type(sep);
07133         if (NIL_P(tmp)) {
07134             rb_raise(rb_eTypeError, "type mismatch: %s given",
07135                      rb_obj_classname(sep));
07136         }
07137         sep = tmp;
07138         pos = rb_str_index(str, sep, 0);
07139     }
07140     if (pos < 0) {
07141       failed:
07142         return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07143     }
07144     if (regex) {
07145         sep = rb_str_subpat(str, sep, INT2FIX(0));
07146         if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07147     }
07148     return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07149                           sep,
07150                           rb_str_subseq(str, pos+RSTRING_LEN(sep),
07151                                              RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07152 }
07153 
07154 /*
07155  *  call-seq:
07156  *     str.rpartition(sep)             -> [head, sep, tail]
07157  *     str.rpartition(regexp)          -> [head, match, tail]
07158  *
07159  *  Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
07160  *  of the string, and returns the part before it, the match, and the part
07161  *  after it.
07162  *  If it is not found, returns two empty strings and <i>str</i>.
07163  *
07164  *     "hello".rpartition("l")         #=> ["hel", "l", "o"]
07165  *     "hello".rpartition("x")         #=> ["", "", "hello"]
07166  *     "hello".rpartition(/.l/)        #=> ["he", "ll", "o"]
07167  */
07168 
07169 static VALUE
07170 rb_str_rpartition(VALUE str, VALUE sep)
07171 {
07172     long pos = RSTRING_LEN(str);
07173     int regex = FALSE;
07174 
07175     if (TYPE(sep) == T_REGEXP) {
07176         pos = rb_reg_search(sep, str, pos, 1);
07177         regex = TRUE;
07178     }
07179     else {
07180         VALUE tmp;
07181 
07182         tmp = rb_check_string_type(sep);
07183         if (NIL_P(tmp)) {
07184             rb_raise(rb_eTypeError, "type mismatch: %s given",
07185                      rb_obj_classname(sep));
07186         }
07187         sep = tmp;
07188         pos = rb_str_sublen(str, pos);
07189         pos = rb_str_rindex(str, sep, pos);
07190     }
07191     if (pos < 0) {
07192         return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07193     }
07194     if (regex) {
07195         sep = rb_reg_nth_match(0, rb_backref_get());
07196     }
07197     return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07198                           sep,
07199                           rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07200 }
07201 
07202 /*
07203  *  call-seq:
07204  *     str.start_with?([prefix]+)   -> true or false
07205  *
07206  *  Returns true if <i>str</i> starts with one of the prefixes given.
07207  *
07208  *    p "hello".start_with?("hell")               #=> true
07209  *
07210  *    # returns true if one of the prefixes matches.
07211  *    p "hello".start_with?("heaven", "hell")     #=> true
07212  *    p "hello".start_with?("heaven", "paradise") #=> false
07213  *
07214  *
07215  *
07216  */
07217 
07218 static VALUE
07219 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07220 {
07221     int i;
07222 
07223     for (i=0; i<argc; i++) {
07224         VALUE tmp = rb_check_string_type(argv[i]);
07225         if (NIL_P(tmp)) continue;
07226         rb_enc_check(str, tmp);
07227         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07228         if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07229             return Qtrue;
07230     }
07231     return Qfalse;
07232 }
07233 
07234 /*
07235  *  call-seq:
07236  *     str.end_with?([suffix]+)   -> true or false
07237  *
07238  *  Returns true if <i>str</i> ends with one of the suffixes given.
07239  */
07240 
07241 static VALUE
07242 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07243 {
07244     int i;
07245     char *p, *s, *e;
07246     rb_encoding *enc;
07247 
07248     for (i=0; i<argc; i++) {
07249         VALUE tmp = rb_check_string_type(argv[i]);
07250         if (NIL_P(tmp)) continue;
07251         enc = rb_enc_check(str, tmp);
07252         if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07253         p = RSTRING_PTR(str);
07254         e = p + RSTRING_LEN(str);
07255         s = e - RSTRING_LEN(tmp);
07256         if (rb_enc_left_char_head(p, s, e, enc) != s)
07257             continue;
07258         if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07259             return Qtrue;
07260     }
07261     return Qfalse;
07262 }
07263 
07264 void
07265 rb_str_setter(VALUE val, ID id, VALUE *var)
07266 {
07267     if (!NIL_P(val) && TYPE(val) != T_STRING) {
07268         rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07269     }
07270     *var = val;
07271 }
07272 
07273 
07274 /*
07275  *  call-seq:
07276  *     str.force_encoding(encoding)   -> str
07277  *
07278  *  Changes the encoding to +encoding+ and returns self.
07279  */
07280 
07281 static VALUE
07282 rb_str_force_encoding(VALUE str, VALUE enc)
07283 {
07284     str_modifiable(str);
07285     rb_enc_associate(str, rb_to_encoding(enc));
07286     ENC_CODERANGE_CLEAR(str);
07287     return str;
07288 }
07289 
07290 /*
07291  *  call-seq:
07292  *     str.valid_encoding?  -> true or false
07293  *
07294  *  Returns true for a string which is encoded correctly.
07295  *
07296  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
07297  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
07298  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
07299  */
07300 
07301 static VALUE
07302 rb_str_valid_encoding_p(VALUE str)
07303 {
07304     int cr = rb_enc_str_coderange(str);
07305 
07306     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07307 }
07308 
07309 /*
07310  *  call-seq:
07311  *     str.ascii_only?  -> true or false
07312  *
07313  *  Returns true for a string which has only ASCII characters.
07314  *
07315  *    "abc".force_encoding("UTF-8").ascii_only?          #=> true
07316  *    "abc\u{6666}".force_encoding("UTF-8").ascii_only?  #=> false
07317  */
07318 
07319 static VALUE
07320 rb_str_is_ascii_only_p(VALUE str)
07321 {
07322     int cr = rb_enc_str_coderange(str);
07323 
07324     return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07325 }
07326 
07341 VALUE
07342 rb_str_ellipsize(VALUE str, long len)
07343 {
07344     static const char ellipsis[] = "...";
07345     const long ellipsislen = sizeof(ellipsis) - 1;
07346     rb_encoding *const enc = rb_enc_get(str);
07347     const long blen = RSTRING_LEN(str);
07348     const char *const p = RSTRING_PTR(str), *e = p + blen;
07349     VALUE estr, ret = 0;
07350 
07351     if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07352     if (len * rb_enc_mbminlen(enc) >= blen ||
07353         (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07354         ret = str;
07355     }
07356     else if (len <= ellipsislen ||
07357              !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07358         if (rb_enc_asciicompat(enc)) {
07359             ret = rb_str_new_with_class(str, ellipsis, len);
07360             rb_enc_associate(ret, enc);
07361         }
07362         else {
07363             estr = rb_usascii_str_new(ellipsis, len);
07364             ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07365         }
07366     }
07367     else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07368         rb_str_cat(ret, ellipsis, ellipsislen);
07369     }
07370     else {
07371         estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07372                              rb_enc_from_encoding(enc), 0, Qnil);
07373         rb_str_append(ret, estr);
07374     }
07375     return ret;
07376 }
07377 
07378 /**********************************************************************
07379  * Document-class: Symbol
07380  *
07381  *  <code>Symbol</code> objects represent names and some strings
07382  *  inside the Ruby
07383  *  interpreter. They are generated using the <code>:name</code> and
07384  *  <code>:"string"</code> literals
07385  *  syntax, and by the various <code>to_sym</code> methods. The same
07386  *  <code>Symbol</code> object will be created for a given name or string
07387  *  for the duration of a program's execution, regardless of the context
07388  *  or meaning of that name. Thus if <code>Fred</code> is a constant in
07389  *  one context, a method in another, and a class in a third, the
07390  *  <code>Symbol</code> <code>:Fred</code> will be the same object in
07391  *  all three contexts.
07392  *
07393  *     module One
07394  *       class Fred
07395  *       end
07396  *       $f1 = :Fred
07397  *     end
07398  *     module Two
07399  *       Fred = 1
07400  *       $f2 = :Fred
07401  *     end
07402  *     def Fred()
07403  *     end
07404  *     $f3 = :Fred
07405  *     $f1.object_id   #=> 2514190
07406  *     $f2.object_id   #=> 2514190
07407  *     $f3.object_id   #=> 2514190
07408  *
07409  */
07410 
07411 
07412 /*
07413  *  call-seq:
07414  *     sym == obj   -> true or false
07415  *
07416  *  Equality---If <i>sym</i> and <i>obj</i> are exactly the same
07417  *  symbol, returns <code>true</code>.
07418  */
07419 
07420 static VALUE
07421 sym_equal(VALUE sym1, VALUE sym2)
07422 {
07423     if (sym1 == sym2) return Qtrue;
07424     return Qfalse;
07425 }
07426 
07427 
07428 static int
07429 sym_printable(const char *s, const char *send, rb_encoding *enc)
07430 {
07431     while (s < send) {
07432         int n;
07433         int c = rb_enc_codepoint_len(s, send, &n, enc);
07434 
07435         if (!rb_enc_isprint(c, enc)) return FALSE;
07436         s += n;
07437     }
07438     return TRUE;
07439 }
07440 
07441 /*
07442  *  call-seq:
07443  *     sym.inspect    -> string
07444  *
07445  *  Returns the representation of <i>sym</i> as a symbol literal.
07446  *
07447  *     :fred.inspect   #=> ":fred"
07448  */
07449 
07450 static VALUE
07451 sym_inspect(VALUE sym)
07452 {
07453     VALUE str;
07454     ID id = SYM2ID(sym);
07455     rb_encoding *enc;
07456     const char *ptr;
07457     long len;
07458     char *dest;
07459     rb_encoding *resenc = rb_default_internal_encoding();
07460 
07461     if (resenc == NULL) resenc = rb_default_external_encoding();
07462     sym = rb_id2str(id);
07463     enc = STR_ENC_GET(sym);
07464     ptr = RSTRING_PTR(sym);
07465     len = RSTRING_LEN(sym);
07466     if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07467         !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07468         str = rb_str_inspect(sym);
07469         len = RSTRING_LEN(str);
07470         rb_str_resize(str, len + 1);
07471         dest = RSTRING_PTR(str);
07472         memmove(dest + 1, dest, len);
07473         dest[0] = ':';
07474     }
07475     else {
07476         char *dest;
07477         str = rb_enc_str_new(0, len + 1, enc);
07478         dest = RSTRING_PTR(str);
07479         dest[0] = ':';
07480         memcpy(dest + 1, ptr, len);
07481     }
07482     return str;
07483 }
07484 
07485 
07486 /*
07487  *  call-seq:
07488  *     sym.id2name   -> string
07489  *     sym.to_s      -> string
07490  *
07491  *  Returns the name or string corresponding to <i>sym</i>.
07492  *
07493  *     :fred.id2name   #=> "fred"
07494  */
07495 
07496 
07497 VALUE
07498 rb_sym_to_s(VALUE sym)
07499 {
07500     ID id = SYM2ID(sym);
07501 
07502     return str_new3(rb_cString, rb_id2str(id));
07503 }
07504 
07505 
07506 /*
07507  * call-seq:
07508  *   sym.to_sym   -> sym
07509  *   sym.intern   -> sym
07510  *
07511  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
07512  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
07513  * in this case.
07514  */
07515 
07516 static VALUE
07517 sym_to_sym(VALUE sym)
07518 {
07519     return sym;
07520 }
07521 
07522 static VALUE
07523 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc)
07524 {
07525     VALUE obj;
07526 
07527     if (argc < 1) {
07528         rb_raise(rb_eArgError, "no receiver given");
07529     }
07530     obj = argv[0];
07531     return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc);
07532 }
07533 
07534 /*
07535  * call-seq:
07536  *   sym.to_proc
07537  *
07538  * Returns a _Proc_ object which responds to the given method by _sym_.
07539  *
07540  *   (1..3).collect(&:to_s)  #=> ["1", "2", "3"]
07541  */
07542 
07543 static VALUE
07544 sym_to_proc(VALUE sym)
07545 {
07546     static VALUE sym_proc_cache = Qfalse;
07547     enum {SYM_PROC_CACHE_SIZE = 67};
07548     VALUE proc;
07549     long id, index;
07550     VALUE *aryp;
07551 
07552     if (!sym_proc_cache) {
07553         sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07554         rb_gc_register_mark_object(sym_proc_cache);
07555         rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07556     }
07557 
07558     id = SYM2ID(sym);
07559     index = (id % SYM_PROC_CACHE_SIZE) << 1;
07560 
07561     aryp = RARRAY_PTR(sym_proc_cache);
07562     if (aryp[index] == sym) {
07563         return aryp[index + 1];
07564     }
07565     else {
07566         proc = rb_proc_new(sym_call, (VALUE)id);
07567         aryp[index] = sym;
07568         aryp[index + 1] = proc;
07569         return proc;
07570     }
07571 }
07572 
07573 /*
07574  * call-seq:
07575  *
07576  *   sym.succ
07577  *
07578  * Same as <code>sym.to_s.succ.intern</code>.
07579  */
07580 
07581 static VALUE
07582 sym_succ(VALUE sym)
07583 {
07584     return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07585 }
07586 
07587 /*
07588  * call-seq:
07589  *
07590  *   sym <=> other       -> -1, 0, +1 or nil
07591  *
07592  * Compares _sym_ with _other_ in string form.
07593  */
07594 
07595 static VALUE
07596 sym_cmp(VALUE sym, VALUE other)
07597 {
07598     if (!SYMBOL_P(other)) {
07599         return Qnil;
07600     }
07601     return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07602 }
07603 
07604 /*
07605  * call-seq:
07606  *
07607  *   sym.casecmp(other)  -> -1, 0, +1 or nil
07608  *
07609  * Case-insensitive version of <code>Symbol#<=></code>.
07610  */
07611 
07612 static VALUE
07613 sym_casecmp(VALUE sym, VALUE other)
07614 {
07615     if (!SYMBOL_P(other)) {
07616         return Qnil;
07617     }
07618     return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07619 }
07620 
07621 /*
07622  * call-seq:
07623  *   sym =~ obj   -> fixnum or nil
07624  *
07625  * Returns <code>sym.to_s =~ obj</code>.
07626  */
07627 
07628 static VALUE
07629 sym_match(VALUE sym, VALUE other)
07630 {
07631     return rb_str_match(rb_sym_to_s(sym), other);
07632 }
07633 
07634 /*
07635  * call-seq:
07636  *   sym[idx]      -> char
07637  *   sym[b, n]     -> string
07638  *
07639  * Returns <code>sym.to_s[]</code>.
07640  */
07641 
07642 static VALUE
07643 sym_aref(int argc, VALUE *argv, VALUE sym)
07644 {
07645     return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07646 }
07647 
07648 /*
07649  * call-seq:
07650  *   sym.length    -> integer
07651  *
07652  * Same as <code>sym.to_s.length</code>.
07653  */
07654 
07655 static VALUE
07656 sym_length(VALUE sym)
07657 {
07658     return rb_str_length(rb_id2str(SYM2ID(sym)));
07659 }
07660 
07661 /*
07662  * call-seq:
07663  *   sym.empty?   -> true or false
07664  *
07665  * Returns whether _sym_ is :"" or not.
07666  */
07667 
07668 static VALUE
07669 sym_empty(VALUE sym)
07670 {
07671     return rb_str_empty(rb_id2str(SYM2ID(sym)));
07672 }
07673 
07674 /*
07675  * call-seq:
07676  *   sym.upcase    -> symbol
07677  *
07678  * Same as <code>sym.to_s.upcase.intern</code>.
07679  */
07680 
07681 static VALUE
07682 sym_upcase(VALUE sym)
07683 {
07684     return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07685 }
07686 
07687 /*
07688  * call-seq:
07689  *   sym.downcase  -> symbol
07690  *
07691  * Same as <code>sym.to_s.downcase.intern</code>.
07692  */
07693 
07694 static VALUE
07695 sym_downcase(VALUE sym)
07696 {
07697     return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07698 }
07699 
07700 /*
07701  * call-seq:
07702  *   sym.capitalize  -> symbol
07703  *
07704  * Same as <code>sym.to_s.capitalize.intern</code>.
07705  */
07706 
07707 static VALUE
07708 sym_capitalize(VALUE sym)
07709 {
07710     return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07711 }
07712 
07713 /*
07714  * call-seq:
07715  *   sym.swapcase  -> symbol
07716  *
07717  * Same as <code>sym.to_s.swapcase.intern</code>.
07718  */
07719 
07720 static VALUE
07721 sym_swapcase(VALUE sym)
07722 {
07723     return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07724 }
07725 
07726 /*
07727  * call-seq:
07728  *   sym.encoding   -> encoding
07729  *
07730  * Returns the Encoding object that represents the encoding of _sym_.
07731  */
07732 
07733 static VALUE
07734 sym_encoding(VALUE sym)
07735 {
07736     return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07737 }
07738 
07739 ID
07740 rb_to_id(VALUE name)
07741 {
07742     VALUE tmp;
07743 
07744     switch (TYPE(name)) {
07745       default:
07746         tmp = rb_check_string_type(name);
07747         if (NIL_P(tmp)) {
07748             tmp = rb_inspect(name);
07749             rb_raise(rb_eTypeError, "%s is not a symbol",
07750                      RSTRING_PTR(tmp));
07751         }
07752         name = tmp;
07753         /* fall through */
07754       case T_STRING:
07755         name = rb_str_intern(name);
07756         /* fall through */
07757       case T_SYMBOL:
07758         return SYM2ID(name);
07759     }
07760     return Qnil; /* not reached */
07761 }
07762 
07763 /*
07764  *  A <code>String</code> object holds and manipulates an arbitrary sequence of
07765  *  bytes, typically representing characters. String objects may be created
07766  *  using <code>String::new</code> or as literals.
07767  *
07768  *  Because of aliasing issues, users of strings should be aware of the methods
07769  *  that modify the contents of a <code>String</code> object.  Typically,
07770  *  methods with names ending in ``!'' modify their receiver, while those
07771  *  without a ``!'' return a new <code>String</code>.  However, there are
07772  *  exceptions, such as <code>String#[]=</code>.
07773  *
07774  */
07775 
07776 void
07777 Init_String(void)
07778 {
07779 #undef rb_intern
07780 #define rb_intern(str) rb_intern_const(str)
07781 
07782     rb_cString  = rb_define_class("String", rb_cObject);
07783     rb_include_module(rb_cString, rb_mComparable);
07784     rb_define_alloc_func(rb_cString, str_alloc);
07785     rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07786     rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07787     rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07788     rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07789     rb_define_method(rb_cString, "==", rb_str_equal, 1);
07790     rb_define_method(rb_cString, "===", rb_str_equal, 1);
07791     rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07792     rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07793     rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07794     rb_define_method(rb_cString, "+", rb_str_plus, 1);
07795     rb_define_method(rb_cString, "*", rb_str_times, 1);
07796     rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07797     rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07798     rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07799     rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07800     rb_define_method(rb_cString, "length", rb_str_length, 0);
07801     rb_define_method(rb_cString, "size", rb_str_length, 0);
07802     rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07803     rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07804     rb_define_method(rb_cString, "=~", rb_str_match, 1);
07805     rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07806     rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07807     rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07808     rb_define_method(rb_cString, "next", rb_str_succ, 0);
07809     rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07810     rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07811     rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07812     rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07813     rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07814     rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07815     rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07816     rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07817     rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07818     rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07819 
07820     rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07821     rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07822     rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07823     rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07824     rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07825     rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07826 
07827     rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07828     rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07829     rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07830     rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07831 
07832     rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07833     rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07834     rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07835     rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07836 
07837     rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07838     rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07839     rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07840     rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07841     rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07842     rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07843     rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07844     rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07845     rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07846     rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07847     rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07848     rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07849     rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07850     rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07851     rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07852     rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07853 
07854     rb_define_method(rb_cString, "include?", rb_str_include, 1);
07855     rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07856     rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07857 
07858     rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07859 
07860     rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07861     rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07862     rb_define_method(rb_cString, "center", rb_str_center, -1);
07863 
07864     rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07865     rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07866     rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07867     rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07868     rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07869     rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07870     rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07871 
07872     rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07873     rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07874     rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07875     rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07876     rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07877     rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07878     rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07879 
07880     rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07881     rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07882     rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07883     rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07884     rb_define_method(rb_cString, "count", rb_str_count, -1);
07885 
07886     rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07887     rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07888     rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07889     rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07890 
07891     rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07892     rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07893     rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07894     rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07895 
07896     rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07897 
07898     rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07899     rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07900 
07901     rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07902     rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07903 
07904     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
07905     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07906     rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07907     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07908 
07909     id_to_s = rb_intern("to_s");
07910 
07911     rb_fs = Qnil;
07912     rb_define_variable("$;", &rb_fs);
07913     rb_define_variable("$-F", &rb_fs);
07914 
07915     rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07916     rb_include_module(rb_cSymbol, rb_mComparable);
07917     rb_undef_alloc_func(rb_cSymbol);
07918     rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07919     rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */
07920 
07921     rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07922     rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07923     rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07924     rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07925     rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07926     rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07927     rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07928     rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07929     rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07930     rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07931 
07932     rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07933     rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07934     rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07935 
07936     rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07937     rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07938     rb_define_method(rb_cSymbol, "length", sym_length, 0);
07939     rb_define_method(rb_cSymbol, "size", sym_length, 0);
07940     rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07941     rb_define_method(rb_cSymbol, "match", sym_match, 1);
07942 
07943     rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07944     rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07945     rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07946     rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07947 
07948     rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07949 }
07950