|
Ruby
1.9.3p537(2014-02-19revision0)
|
00001 /********************************************************************** 00002 00003 string.c - 00004 00005 $Author$ 00006 created at: Mon Aug 9 17:12:58 JST 1993 00007 00008 Copyright (C) 1993-2007 Yukihiro Matsumoto 00009 Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 00010 Copyright (C) 2000 Information-technology Promotion Agency, Japan 00011 00012 **********************************************************************/ 00013 00014 #include "ruby/ruby.h" 00015 #include "ruby/re.h" 00016 #include "ruby/encoding.h" 00017 #include "internal.h" 00018 #include <assert.h> 00019 00020 #define BEG(no) (regs->beg[(no)]) 00021 #define END(no) (regs->end[(no)]) 00022 00023 #include <math.h> 00024 #include <ctype.h> 00025 00026 #ifdef HAVE_UNISTD_H 00027 #include <unistd.h> 00028 #endif 00029 00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0])) 00031 00032 #undef rb_str_new_cstr 00033 #undef rb_tainted_str_new_cstr 00034 #undef rb_usascii_str_new_cstr 00035 #undef rb_external_str_new_cstr 00036 #undef rb_locale_str_new_cstr 00037 #undef rb_str_new2 00038 #undef rb_str_new3 00039 #undef rb_str_new4 00040 #undef rb_str_new5 00041 #undef rb_tainted_str_new2 00042 #undef rb_usascii_str_new2 00043 #undef rb_str_dup_frozen 00044 #undef rb_str_buf_new_cstr 00045 #undef rb_str_buf_new2 00046 #undef rb_str_buf_cat2 00047 #undef rb_str_cat2 00048 00049 static VALUE rb_str_clear(VALUE str); 00050 00051 VALUE rb_cString; 00052 VALUE rb_cSymbol; 00053 00054 #define RUBY_MAX_CHAR_LEN 16 00055 #define STR_TMPLOCK FL_USER7 00056 #define STR_NOEMBED FL_USER1 00057 #define STR_SHARED FL_USER2 /* = ELTS_SHARED */ 00058 #define STR_ASSOC FL_USER3 00059 #define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED) 00060 #define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC) 00061 #define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC) 00062 #define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC)) 00063 #define 
STR_UNSET_NOCAPA(s) do {\ 00064 if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\ 00065 } while (0) 00066 00067 00068 #define STR_SET_NOEMBED(str) do {\ 00069 FL_SET((str), STR_NOEMBED);\ 00070 STR_SET_EMBED_LEN((str), 0);\ 00071 } while (0) 00072 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED) 00073 #define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED)) 00074 #define STR_SET_EMBED_LEN(str, n) do { \ 00075 long tmp_n = (n);\ 00076 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\ 00077 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ 00078 } while (0) 00079 00080 #define STR_SET_LEN(str, n) do { \ 00081 if (STR_EMBED_P(str)) {\ 00082 STR_SET_EMBED_LEN((str), (n));\ 00083 }\ 00084 else {\ 00085 RSTRING(str)->as.heap.len = (n);\ 00086 }\ 00087 } while (0) 00088 00089 #define STR_DEC_LEN(str) do {\ 00090 if (STR_EMBED_P(str)) {\ 00091 long n = RSTRING_LEN(str);\ 00092 n--;\ 00093 STR_SET_EMBED_LEN((str), n);\ 00094 }\ 00095 else {\ 00096 RSTRING(str)->as.heap.len--;\ 00097 }\ 00098 } while (0) 00099 00100 #define RESIZE_CAPA(str,capacity) do {\ 00101 if (STR_EMBED_P(str)) {\ 00102 if ((capacity) > RSTRING_EMBED_LEN_MAX) {\ 00103 char *tmp = ALLOC_N(char, (capacity)+1);\ 00104 memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\ 00105 RSTRING(str)->as.heap.ptr = tmp;\ 00106 RSTRING(str)->as.heap.len = RSTRING_LEN(str);\ 00107 STR_SET_NOEMBED(str);\ 00108 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00109 }\ 00110 }\ 00111 else {\ 00112 REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\ 00113 if (!STR_NOCAPA_P(str))\ 00114 RSTRING(str)->as.heap.aux.capa = (capacity);\ 00115 }\ 00116 } while (0) 00117 00118 #define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00119 #define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) 00120 00121 #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) 00122 00123 static inline int 00124 single_byte_optimizable(VALUE str) 00125 { 00126 
rb_encoding *enc; 00127 00128 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */ 00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) 00130 return 1; 00131 00132 enc = STR_ENC_GET(str); 00133 if (rb_enc_mbmaxlen(enc) == 1) 00134 return 1; 00135 00136 /* Conservative. Possibly single byte. 00137 * "\xa1" in Shift_JIS for example. */ 00138 return 0; 00139 } 00140 00141 VALUE rb_fs; 00142 00143 static inline const char * 00144 search_nonascii(const char *p, const char *e) 00145 { 00146 #if SIZEOF_VALUE == 8 00147 # define NONASCII_MASK 0x8080808080808080ULL 00148 #elif SIZEOF_VALUE == 4 00149 # define NONASCII_MASK 0x80808080UL 00150 #endif 00151 #ifdef NONASCII_MASK 00152 if ((int)sizeof(VALUE) * 2 < e - p) { 00153 const VALUE *s, *t; 00154 const VALUE lowbits = sizeof(VALUE) - 1; 00155 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 00156 while (p < (const char *)s) { 00157 if (!ISASCII(*p)) 00158 return p; 00159 p++; 00160 } 00161 t = (const VALUE*)(~lowbits & (VALUE)e); 00162 while (s < t) { 00163 if (*s & NONASCII_MASK) { 00164 t = s; 00165 break; 00166 } 00167 s++; 00168 } 00169 p = (const char *)t; 00170 } 00171 #endif 00172 while (p < e) { 00173 if (!ISASCII(*p)) 00174 return p; 00175 p++; 00176 } 00177 return NULL; 00178 } 00179 00180 static int 00181 coderange_scan(const char *p, long len, rb_encoding *enc) 00182 { 00183 const char *e = p + len; 00184 00185 if (rb_enc_to_index(enc) == 0) { 00186 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00187 p = search_nonascii(p, e); 00188 return p ? 
ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT; 00189 } 00190 00191 if (rb_enc_asciicompat(enc)) { 00192 p = search_nonascii(p, e); 00193 if (!p) { 00194 return ENC_CODERANGE_7BIT; 00195 } 00196 while (p < e) { 00197 int ret = rb_enc_precise_mbclen(p, e, enc); 00198 if (!MBCLEN_CHARFOUND_P(ret)) { 00199 return ENC_CODERANGE_BROKEN; 00200 } 00201 p += MBCLEN_CHARFOUND_LEN(ret); 00202 if (p < e) { 00203 p = search_nonascii(p, e); 00204 if (!p) { 00205 return ENC_CODERANGE_VALID; 00206 } 00207 } 00208 } 00209 if (e < p) { 00210 return ENC_CODERANGE_BROKEN; 00211 } 00212 return ENC_CODERANGE_VALID; 00213 } 00214 00215 while (p < e) { 00216 int ret = rb_enc_precise_mbclen(p, e, enc); 00217 00218 if (!MBCLEN_CHARFOUND_P(ret)) { 00219 return ENC_CODERANGE_BROKEN; 00220 } 00221 p += MBCLEN_CHARFOUND_LEN(ret); 00222 } 00223 if (e < p) { 00224 return ENC_CODERANGE_BROKEN; 00225 } 00226 return ENC_CODERANGE_VALID; 00227 } 00228 00229 long 00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr) 00231 { 00232 const char *p = s; 00233 00234 if (*cr == ENC_CODERANGE_BROKEN) 00235 return e - s; 00236 00237 if (rb_enc_to_index(enc) == 0) { 00238 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */ 00239 p = search_nonascii(p, e); 00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 00241 return e - s; 00242 } 00243 else if (rb_enc_asciicompat(enc)) { 00244 p = search_nonascii(p, e); 00245 if (!p) { 00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT; 00247 return e - s; 00248 } 00249 while (p < e) { 00250 int ret = rb_enc_precise_mbclen(p, e, enc); 00251 if (!MBCLEN_CHARFOUND_P(ret)) { 00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00253 return p - s; 00254 } 00255 p += MBCLEN_CHARFOUND_LEN(ret); 00256 if (p < e) { 00257 p = search_nonascii(p, e); 00258 if (!p) { 00259 *cr = ENC_CODERANGE_VALID; 00260 return e - s; 00261 } 00262 } 00263 } 00264 *cr = e < p ? 
ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00265 return p - s; 00266 } 00267 else { 00268 while (p < e) { 00269 int ret = rb_enc_precise_mbclen(p, e, enc); 00270 if (!MBCLEN_CHARFOUND_P(ret)) { 00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN; 00272 return p - s; 00273 } 00274 p += MBCLEN_CHARFOUND_LEN(ret); 00275 } 00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID; 00277 return p - s; 00278 } 00279 } 00280 00281 static inline void 00282 str_enc_copy(VALUE str1, VALUE str2) 00283 { 00284 rb_enc_set_index(str1, ENCODING_GET(str2)); 00285 } 00286 00287 static void 00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src) 00289 { 00290 /* this function is designed for copying encoding and coderange 00291 * from src to new string "dest" which is made from the part of src. 00292 */ 00293 str_enc_copy(dest, src); 00294 switch (ENC_CODERANGE(src)) { 00295 case ENC_CODERANGE_7BIT: 00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00297 break; 00298 case ENC_CODERANGE_VALID: 00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) || 00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest))) 00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00302 else 00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00304 break; 00305 default: 00306 if (RSTRING_LEN(dest) == 0) { 00307 if (!rb_enc_asciicompat(STR_ENC_GET(src))) 00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID); 00309 else 00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT); 00311 } 00312 break; 00313 } 00314 } 00315 00316 static void 00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) 00318 { 00319 str_enc_copy(dest, src); 00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); 00321 } 00322 00323 int 00324 rb_enc_str_coderange(VALUE str) 00325 { 00326 int cr = ENC_CODERANGE(str); 00327 00328 if (cr == ENC_CODERANGE_UNKNOWN) { 00329 rb_encoding *enc = STR_ENC_GET(str); 00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); 00331 ENC_CODERANGE_SET(str, cr); 00332 } 00333 
return cr; 00334 } 00335 00336 int 00337 rb_enc_str_asciionly_p(VALUE str) 00338 { 00339 rb_encoding *enc = STR_ENC_GET(str); 00340 00341 if (!rb_enc_asciicompat(enc)) 00342 return FALSE; 00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) 00344 return TRUE; 00345 return FALSE; 00346 } 00347 00348 static inline void 00349 str_mod_check(VALUE s, const char *p, long len) 00350 { 00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){ 00352 rb_raise(rb_eRuntimeError, "string modified"); 00353 } 00354 } 00355 00356 size_t 00357 rb_str_capacity(VALUE str) 00358 { 00359 if (STR_EMBED_P(str)) { 00360 return RSTRING_EMBED_LEN_MAX; 00361 } 00362 else if (STR_NOCAPA_P(str)) { 00363 return RSTRING(str)->as.heap.len; 00364 } 00365 else { 00366 return RSTRING(str)->as.heap.aux.capa; 00367 } 00368 } 00369 00370 static inline VALUE 00371 str_alloc(VALUE klass) 00372 { 00373 NEWOBJ(str, struct RString); 00374 OBJSETUP(str, klass, T_STRING); 00375 00376 str->as.heap.ptr = 0; 00377 str->as.heap.len = 0; 00378 str->as.heap.aux.capa = 0; 00379 00380 return (VALUE)str; 00381 } 00382 00383 static VALUE 00384 str_new(VALUE klass, const char *ptr, long len) 00385 { 00386 VALUE str; 00387 00388 if (len < 0) { 00389 rb_raise(rb_eArgError, "negative string size (or size too big)"); 00390 } 00391 00392 str = str_alloc(klass); 00393 if (len > RSTRING_EMBED_LEN_MAX) { 00394 RSTRING(str)->as.heap.aux.capa = len; 00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1); 00396 STR_SET_NOEMBED(str); 00397 } 00398 else if (len == 0) { 00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 00400 } 00401 if (ptr) { 00402 memcpy(RSTRING_PTR(str), ptr, len); 00403 } 00404 STR_SET_LEN(str, len); 00405 RSTRING_PTR(str)[len] = '\0'; 00406 return str; 00407 } 00408 00409 VALUE 00410 rb_str_new(const char *ptr, long len) 00411 { 00412 return str_new(rb_cString, ptr, len); 00413 } 00414 00415 VALUE 00416 rb_usascii_str_new(const char *ptr, long len) 00417 { 00418 VALUE str = rb_str_new(ptr, len); 00419 
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00420 return str; 00421 } 00422 00423 VALUE 00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) 00425 { 00426 VALUE str = rb_str_new(ptr, len); 00427 rb_enc_associate(str, enc); 00428 return str; 00429 } 00430 00431 VALUE 00432 rb_str_new_cstr(const char *ptr) 00433 { 00434 if (!ptr) { 00435 rb_raise(rb_eArgError, "NULL pointer given"); 00436 } 00437 return rb_str_new(ptr, strlen(ptr)); 00438 } 00439 00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr)) 00441 #define rb_str_new2 rb_str_new_cstr 00442 00443 VALUE 00444 rb_usascii_str_new_cstr(const char *ptr) 00445 { 00446 VALUE str = rb_str_new2(ptr); 00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT); 00448 return str; 00449 } 00450 00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr)) 00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr 00453 00454 VALUE 00455 rb_tainted_str_new(const char *ptr, long len) 00456 { 00457 VALUE str = rb_str_new(ptr, len); 00458 00459 OBJ_TAINT(str); 00460 return str; 00461 } 00462 00463 VALUE 00464 rb_tainted_str_new_cstr(const char *ptr) 00465 { 00466 VALUE str = rb_str_new2(ptr); 00467 00468 OBJ_TAINT(str); 00469 return str; 00470 } 00471 00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr)) 00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr 00474 00475 VALUE 00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) 00477 { 00478 rb_econv_t *ec; 00479 rb_econv_result_t ret; 00480 long len; 00481 VALUE newstr; 00482 const unsigned char *sp; 00483 unsigned char *dp; 00484 00485 if (!to) return str; 00486 if (from == to) return str; 00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) || 00488 to == rb_ascii8bit_encoding()) { 00489 if (STR_ENC_GET(str) != to) { 00490 str = 
rb_str_dup(str); 00491 rb_enc_associate(str, to); 00492 } 00493 return str; 00494 } 00495 00496 len = RSTRING_LEN(str); 00497 newstr = rb_str_new(0, len); 00498 00499 retry: 00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts); 00501 if (!ec) return str; 00502 00503 sp = (unsigned char*)RSTRING_PTR(str); 00504 dp = (unsigned char*)RSTRING_PTR(newstr); 00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str), 00506 &dp, (unsigned char*)RSTRING_END(newstr), 0); 00507 rb_econv_close(ec); 00508 switch (ret) { 00509 case econv_destination_buffer_full: 00510 /* destination buffer short */ 00511 len = len < 2 ? 2 : len * 2; 00512 rb_str_resize(newstr, len); 00513 goto retry; 00514 00515 case econv_finished: 00516 len = dp - (unsigned char*)RSTRING_PTR(newstr); 00517 rb_str_set_len(newstr, len); 00518 rb_enc_associate(newstr, to); 00519 return newstr; 00520 00521 default: 00522 /* some error, return original */ 00523 return str; 00524 } 00525 } 00526 00527 VALUE 00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to) 00529 { 00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil); 00531 } 00532 00533 VALUE 00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc) 00535 { 00536 VALUE str; 00537 00538 str = rb_tainted_str_new(ptr, len); 00539 if (eenc == rb_usascii_encoding() && 00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) { 00541 rb_enc_associate(str, rb_ascii8bit_encoding()); 00542 return str; 00543 } 00544 rb_enc_associate(str, eenc); 00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding()); 00546 } 00547 00548 VALUE 00549 rb_external_str_new(const char *ptr, long len) 00550 { 00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding()); 00552 } 00553 00554 VALUE 00555 rb_external_str_new_cstr(const char *ptr) 00556 { 00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding()); 00558 } 00559 00560 VALUE 00561 
rb_locale_str_new(const char *ptr, long len) 00562 { 00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding()); 00564 } 00565 00566 VALUE 00567 rb_locale_str_new_cstr(const char *ptr) 00568 { 00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding()); 00570 } 00571 00572 VALUE 00573 rb_filesystem_str_new(const char *ptr, long len) 00574 { 00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding()); 00576 } 00577 00578 VALUE 00579 rb_filesystem_str_new_cstr(const char *ptr) 00580 { 00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding()); 00582 } 00583 00584 VALUE 00585 rb_str_export(VALUE str) 00586 { 00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding()); 00588 } 00589 00590 VALUE 00591 rb_str_export_locale(VALUE str) 00592 { 00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding()); 00594 } 00595 00596 VALUE 00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc) 00598 { 00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc); 00600 } 00601 00602 static VALUE 00603 str_replace_shared(VALUE str2, VALUE str) 00604 { 00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) { 00606 STR_SET_EMBED(str2); 00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1); 00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str)); 00609 } 00610 else { 00611 str = rb_str_new_frozen(str); 00612 FL_SET(str2, STR_NOEMBED); 00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00615 RSTRING(str2)->as.heap.aux.shared = str; 00616 FL_SET(str2, ELTS_SHARED); 00617 } 00618 rb_enc_cr_str_exact_copy(str2, str); 00619 00620 return str2; 00621 } 00622 00623 static VALUE 00624 str_new_shared(VALUE klass, VALUE str) 00625 { 00626 return str_replace_shared(str_alloc(klass), str); 00627 } 00628 00629 static VALUE 00630 str_new3(VALUE klass, VALUE str) 00631 { 00632 return str_new_shared(klass, str); 00633 } 00634 
00635 VALUE 00636 rb_str_new_shared(VALUE str) 00637 { 00638 VALUE str2 = str_new3(rb_obj_class(str), str); 00639 00640 OBJ_INFECT(str2, str); 00641 return str2; 00642 } 00643 00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str)) 00645 #define rb_str_new3 rb_str_new_shared 00646 00647 static VALUE 00648 str_new4(VALUE klass, VALUE str) 00649 { 00650 VALUE str2; 00651 00652 str2 = str_alloc(klass); 00653 STR_SET_NOEMBED(str2); 00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str); 00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str); 00656 if (STR_SHARED_P(str)) { 00657 VALUE shared = RSTRING(str)->as.heap.aux.shared; 00658 assert(OBJ_FROZEN(shared)); 00659 FL_SET(str2, ELTS_SHARED); 00660 RSTRING(str2)->as.heap.aux.shared = shared; 00661 } 00662 else { 00663 FL_SET(str, ELTS_SHARED); 00664 RSTRING(str)->as.heap.aux.shared = str2; 00665 } 00666 rb_enc_cr_str_exact_copy(str2, str); 00667 OBJ_INFECT(str2, str); 00668 return str2; 00669 } 00670 00671 VALUE 00672 rb_str_new_frozen(VALUE orig) 00673 { 00674 VALUE klass, str; 00675 00676 if (OBJ_FROZEN(orig)) return orig; 00677 klass = rb_obj_class(orig); 00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) { 00679 long ofs; 00680 assert(OBJ_FROZEN(str)); 00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig); 00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) || 00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) || 00684 ENCODING_GET(str) != ENCODING_GET(orig)) { 00685 str = str_new3(klass, str); 00686 RSTRING(str)->as.heap.ptr += ofs; 00687 RSTRING(str)->as.heap.len -= ofs; 00688 rb_enc_cr_str_exact_copy(str, orig); 00689 OBJ_INFECT(str, orig); 00690 } 00691 } 00692 else if (STR_EMBED_P(orig)) { 00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig)); 00694 rb_enc_cr_str_exact_copy(str, orig); 00695 OBJ_INFECT(str, orig); 00696 } 00697 else if (STR_ASSOC_P(orig)) { 00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared; 00699 FL_UNSET(orig, STR_ASSOC); 00700 str = str_new4(klass, 
orig); 00701 FL_SET(str, STR_ASSOC); 00702 RSTRING(str)->as.heap.aux.shared = assoc; 00703 } 00704 else { 00705 str = str_new4(klass, orig); 00706 } 00707 OBJ_FREEZE(str); 00708 return str; 00709 } 00710 00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig)) 00712 #define rb_str_new4 rb_str_new_frozen 00713 00714 VALUE 00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len) 00716 { 00717 return str_new(rb_obj_class(obj), ptr, len); 00718 } 00719 00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len), 00721 rb_str_new_with_class, (obj, ptr, len)) 00722 #define rb_str_new5 rb_str_new_with_class 00723 00724 static VALUE 00725 str_new_empty(VALUE str) 00726 { 00727 VALUE v = rb_str_new5(str, 0, 0); 00728 rb_enc_copy(v, str); 00729 OBJ_INFECT(v, str); 00730 return v; 00731 } 00732 00733 #define STR_BUF_MIN_SIZE 128 00734 00735 VALUE 00736 rb_str_buf_new(long capa) 00737 { 00738 VALUE str = str_alloc(rb_cString); 00739 00740 if (capa < STR_BUF_MIN_SIZE) { 00741 capa = STR_BUF_MIN_SIZE; 00742 } 00743 FL_SET(str, STR_NOEMBED); 00744 RSTRING(str)->as.heap.aux.capa = capa; 00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1); 00746 RSTRING(str)->as.heap.ptr[0] = '\0'; 00747 00748 return str; 00749 } 00750 00751 VALUE 00752 rb_str_buf_new_cstr(const char *ptr) 00753 { 00754 VALUE str; 00755 long len = strlen(ptr); 00756 00757 str = rb_str_buf_new(len); 00758 rb_str_buf_cat(str, ptr, len); 00759 00760 return str; 00761 } 00762 00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr)) 00764 #define rb_str_buf_new2 rb_str_buf_new_cstr 00765 00766 VALUE 00767 rb_str_tmp_new(long len) 00768 { 00769 return str_new(0, 0, len); 00770 } 00771 00772 void * 00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len) 00774 { 00775 VALUE s = rb_str_tmp_new(len); 00776 *store = s; 00777 return RSTRING_PTR(s); 00778 } 00779 00780 void 00781 rb_free_tmp_buffer(volatile VALUE *store) 00782 { 00783 VALUE s = 
*store; 00784 *store = 0; 00785 if (s) rb_str_clear(s); 00786 } 00787 00788 void 00789 rb_str_free(VALUE str) 00790 { 00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00792 xfree(RSTRING(str)->as.heap.ptr); 00793 } 00794 } 00795 00796 RUBY_FUNC_EXPORTED size_t 00797 rb_str_memsize(VALUE str) 00798 { 00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) { 00800 return RSTRING(str)->as.heap.aux.capa; 00801 } 00802 else { 00803 return 0; 00804 } 00805 } 00806 00807 VALUE 00808 rb_str_to_str(VALUE str) 00809 { 00810 return rb_convert_type(str, T_STRING, "String", "to_str"); 00811 } 00812 00813 static inline void str_discard(VALUE str); 00814 00815 void 00816 rb_str_shared_replace(VALUE str, VALUE str2) 00817 { 00818 rb_encoding *enc; 00819 int cr; 00820 if (str == str2) return; 00821 enc = STR_ENC_GET(str2); 00822 cr = ENC_CODERANGE(str2); 00823 str_discard(str); 00824 OBJ_INFECT(str, str2); 00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) { 00826 STR_SET_EMBED(str); 00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1); 00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2)); 00829 rb_enc_associate(str, enc); 00830 ENC_CODERANGE_SET(str, cr); 00831 return; 00832 } 00833 STR_SET_NOEMBED(str); 00834 STR_UNSET_NOCAPA(str); 00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2); 00837 if (STR_NOCAPA_P(str2)) { 00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA); 00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared; 00840 } 00841 else { 00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa; 00843 } 00844 STR_SET_EMBED(str2); /* abandon str2 */ 00845 RSTRING_PTR(str2)[0] = 0; 00846 STR_SET_EMBED_LEN(str2, 0); 00847 rb_enc_associate(str, enc); 00848 ENC_CODERANGE_SET(str, cr); 00849 } 00850 00851 static ID id_to_s; 00852 00853 VALUE 00854 rb_obj_as_string(VALUE obj) 00855 { 00856 VALUE str; 00857 00858 if (TYPE(obj) == T_STRING) { 00859 return obj; 00860 } 00861 str = 
rb_funcall(obj, id_to_s, 0); 00862 if (TYPE(str) != T_STRING) 00863 return rb_any_to_s(obj); 00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str); 00865 return str; 00866 } 00867 00868 static VALUE 00869 str_replace(VALUE str, VALUE str2) 00870 { 00871 long len; 00872 00873 len = RSTRING_LEN(str2); 00874 if (STR_ASSOC_P(str2)) { 00875 str2 = rb_str_new4(str2); 00876 } 00877 if (STR_SHARED_P(str2)) { 00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared; 00879 assert(OBJ_FROZEN(shared)); 00880 STR_SET_NOEMBED(str); 00881 RSTRING(str)->as.heap.len = len; 00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2); 00883 FL_SET(str, ELTS_SHARED); 00884 FL_UNSET(str, STR_ASSOC); 00885 RSTRING(str)->as.heap.aux.shared = shared; 00886 } 00887 else { 00888 str_replace_shared(str, str2); 00889 } 00890 00891 OBJ_INFECT(str, str2); 00892 rb_enc_cr_str_exact_copy(str, str2); 00893 return str; 00894 } 00895 00896 static VALUE 00897 str_duplicate(VALUE klass, VALUE str) 00898 { 00899 VALUE dup = str_alloc(klass); 00900 str_replace(dup, str); 00901 return dup; 00902 } 00903 00904 VALUE 00905 rb_str_dup(VALUE str) 00906 { 00907 return str_duplicate(rb_obj_class(str), str); 00908 } 00909 00910 VALUE 00911 rb_str_resurrect(VALUE str) 00912 { 00913 return str_replace(str_alloc(rb_cString), str); 00914 } 00915 00916 /* 00917 * call-seq: 00918 * String.new(str="") -> new_str 00919 * 00920 * Returns a new string object containing a copy of <i>str</i>. 
00921 */ 00922 00923 static VALUE 00924 rb_str_init(int argc, VALUE *argv, VALUE str) 00925 { 00926 VALUE orig; 00927 00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1) 00929 rb_str_replace(str, orig); 00930 return str; 00931 } 00932 00933 static inline long 00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) 00935 { 00936 long c; 00937 const char *q; 00938 00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00941 } 00942 else if (rb_enc_asciicompat(enc)) { 00943 c = 0; 00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { 00945 while (p < e) { 00946 if (ISASCII(*p)) { 00947 q = search_nonascii(p, e); 00948 if (!q) 00949 return c + (e - p); 00950 c += q - p; 00951 p = q; 00952 } 00953 p += rb_enc_fast_mbclen(p, e, enc); 00954 c++; 00955 } 00956 } 00957 else { 00958 while (p < e) { 00959 if (ISASCII(*p)) { 00960 q = search_nonascii(p, e); 00961 if (!q) 00962 return c + (e - p); 00963 c += q - p; 00964 p = q; 00965 } 00966 p += rb_enc_mbclen(p, e, enc); 00967 c++; 00968 } 00969 } 00970 return c; 00971 } 00972 00973 for (c=0; p<e; c++) { 00974 p += rb_enc_mbclen(p, e, enc); 00975 } 00976 return c; 00977 } 00978 00979 long 00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) 00981 { 00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); 00983 } 00984 00985 long 00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) 00987 { 00988 long c; 00989 const char *q; 00990 int ret; 00991 00992 *cr = 0; 00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc); 00995 } 00996 else if (rb_enc_asciicompat(enc)) { 00997 c = 0; 00998 while (p < e) { 00999 if (ISASCII(*p)) { 01000 q = search_nonascii(p, e); 01001 if (!q) { 01002 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01003 return c + (e - p); 01004 } 01005 c += q - p; 01006 p = q; 01007 } 01008 ret = 
rb_enc_precise_mbclen(p, e, enc); 01009 if (MBCLEN_CHARFOUND_P(ret)) { 01010 *cr |= ENC_CODERANGE_VALID; 01011 p += MBCLEN_CHARFOUND_LEN(ret); 01012 } 01013 else { 01014 *cr = ENC_CODERANGE_BROKEN; 01015 p++; 01016 } 01017 c++; 01018 } 01019 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01020 return c; 01021 } 01022 01023 for (c=0; p<e; c++) { 01024 ret = rb_enc_precise_mbclen(p, e, enc); 01025 if (MBCLEN_CHARFOUND_P(ret)) { 01026 *cr |= ENC_CODERANGE_VALID; 01027 p += MBCLEN_CHARFOUND_LEN(ret); 01028 } 01029 else { 01030 *cr = ENC_CODERANGE_BROKEN; 01031 if (p + rb_enc_mbminlen(enc) <= e) 01032 p += rb_enc_mbminlen(enc); 01033 else 01034 p = e; 01035 } 01036 } 01037 if (!*cr) *cr = ENC_CODERANGE_7BIT; 01038 return c; 01039 } 01040 01041 #ifdef NONASCII_MASK 01042 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80) 01043 01044 /* 01045 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx 01046 * bit represention. (see http://en.wikipedia.org/wiki/UTF-8) 01047 * Therefore, following pseudo code can detect UTF-8 leading byte. 01048 * 01049 * if (!(byte & 0x80)) 01050 * byte |= 0x40; // turn on bit6 01051 * return ((byte>>6) & 1); // bit6 represent it's leading byte or not. 01052 * 01053 * This function calculate every bytes in the argument word `s' 01054 * using the above logic concurrently. and gather every bytes result. 01055 */ 01056 static inline VALUE 01057 count_utf8_lead_bytes_with_word(const VALUE *s) 01058 { 01059 VALUE d = *s; 01060 01061 /* Transform into bit0 represent UTF-8 leading or not. */ 01062 d |= ~(d>>1); 01063 d >>= 6; 01064 d &= NONASCII_MASK >> 7; 01065 01066 /* Gather every bytes. 
*/ 01067 d += (d>>8); 01068 d += (d>>16); 01069 #if SIZEOF_VALUE == 8 01070 d += (d>>32); 01071 #endif 01072 return (d&0xF); 01073 } 01074 #endif 01075 01076 static long 01077 str_strlen(VALUE str, rb_encoding *enc) 01078 { 01079 const char *p, *e; 01080 long n; 01081 int cr; 01082 01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str); 01084 if (!enc) enc = STR_ENC_GET(str); 01085 p = RSTRING_PTR(str); 01086 e = RSTRING_END(str); 01087 cr = ENC_CODERANGE(str); 01088 #ifdef NONASCII_MASK 01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01090 enc == rb_utf8_encoding()) { 01091 01092 VALUE len = 0; 01093 if ((int)sizeof(VALUE) * 2 < e - p) { 01094 const VALUE *s, *t; 01095 const VALUE lowbits = sizeof(VALUE) - 1; 01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01097 t = (const VALUE*)(~lowbits & (VALUE)e); 01098 while (p < (const char *)s) { 01099 if (is_utf8_lead_byte(*p)) len++; 01100 p++; 01101 } 01102 while (s < t) { 01103 len += count_utf8_lead_bytes_with_word(s); 01104 s++; 01105 } 01106 p = (const char *)s; 01107 } 01108 while (p < e) { 01109 if (is_utf8_lead_byte(*p)) len++; 01110 p++; 01111 } 01112 return (long)len; 01113 } 01114 #endif 01115 n = rb_enc_strlen_cr(p, e, enc, &cr); 01116 if (cr) { 01117 ENC_CODERANGE_SET(str, cr); 01118 } 01119 return n; 01120 } 01121 01122 long 01123 rb_str_strlen(VALUE str) 01124 { 01125 return str_strlen(str, STR_ENC_GET(str)); 01126 } 01127 01128 /* 01129 * call-seq: 01130 * str.length -> integer 01131 * str.size -> integer 01132 * 01133 * Returns the character length of <i>str</i>. 01134 */ 01135 01136 VALUE 01137 rb_str_length(VALUE str) 01138 { 01139 long len; 01140 01141 len = str_strlen(str, STR_ENC_GET(str)); 01142 return LONG2NUM(len); 01143 } 01144 01145 /* 01146 * call-seq: 01147 * str.bytesize -> integer 01148 * 01149 * Returns the length of <i>str</i> in bytes. 
01150 */ 01151 01152 static VALUE 01153 rb_str_bytesize(VALUE str) 01154 { 01155 return LONG2NUM(RSTRING_LEN(str)); 01156 } 01157 01158 /* 01159 * call-seq: 01160 * str.empty? -> true or false 01161 * 01162 * Returns <code>true</code> if <i>str</i> has a length of zero. 01163 * 01164 * "hello".empty? #=> false 01165 * "".empty? #=> true 01166 */ 01167 01168 static VALUE 01169 rb_str_empty(VALUE str) 01170 { 01171 if (RSTRING_LEN(str) == 0) 01172 return Qtrue; 01173 return Qfalse; 01174 } 01175 01176 /* 01177 * call-seq: 01178 * str + other_str -> new_str 01179 * 01180 * Concatenation---Returns a new <code>String</code> containing 01181 * <i>other_str</i> concatenated to <i>str</i>. 01182 * 01183 * "Hello from " + self.to_s #=> "Hello from main" 01184 */ 01185 01186 VALUE 01187 rb_str_plus(VALUE str1, VALUE str2) 01188 { 01189 VALUE str3; 01190 rb_encoding *enc; 01191 01192 StringValue(str2); 01193 enc = rb_enc_check(str1, str2); 01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); 01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); 01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), 01197 RSTRING_PTR(str2), RSTRING_LEN(str2)); 01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0'; 01199 01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) 01201 OBJ_TAINT(str3); 01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc), 01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2))); 01204 return str3; 01205 } 01206 01207 /* 01208 * call-seq: 01209 * str * integer -> new_str 01210 * 01211 * Copy---Returns a new <code>String</code> containing <i>integer</i> copies of 01212 * the receiver. 01213 * 01214 * "Ho! " * 3 #=> "Ho! Ho! Ho! 
" 01215 */ 01216 01217 VALUE 01218 rb_str_times(VALUE str, VALUE times) 01219 { 01220 VALUE str2; 01221 long n, len; 01222 char *ptr2; 01223 01224 len = NUM2LONG(times); 01225 if (len < 0) { 01226 rb_raise(rb_eArgError, "negative argument"); 01227 } 01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) { 01229 rb_raise(rb_eArgError, "argument too big"); 01230 } 01231 01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str)); 01233 ptr2 = RSTRING_PTR(str2); 01234 if (len) { 01235 n = RSTRING_LEN(str); 01236 memcpy(ptr2, RSTRING_PTR(str), n); 01237 while (n <= len/2) { 01238 memcpy(ptr2 + n, ptr2, n); 01239 n *= 2; 01240 } 01241 memcpy(ptr2 + n, ptr2, len-n); 01242 } 01243 ptr2[RSTRING_LEN(str2)] = '\0'; 01244 OBJ_INFECT(str2, str); 01245 rb_enc_cr_str_copy_for_substr(str2, str); 01246 01247 return str2; 01248 } 01249 01250 /* 01251 * call-seq: 01252 * str % arg -> new_str 01253 * 01254 * Format---Uses <i>str</i> as a format specification, and returns the result 01255 * of applying it to <i>arg</i>. If the format specification contains more than 01256 * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code> 01257 * containing the values to be substituted. See <code>Kernel::sprintf</code> for 01258 * details of the format string. 
01259 * 01260 * "%05d" % 123 #=> "00123" 01261 * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6" 01262 * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar" 01263 */ 01264 01265 static VALUE 01266 rb_str_format_m(VALUE str, VALUE arg) 01267 { 01268 volatile VALUE tmp = rb_check_array_type(arg); 01269 01270 if (!NIL_P(tmp)) { 01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str); 01272 } 01273 return rb_str_format(1, &arg, str); 01274 } 01275 01276 static inline void 01277 str_modifiable(VALUE str) 01278 { 01279 if (FL_TEST(str, STR_TMPLOCK)) { 01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked"); 01281 } 01282 rb_check_frozen(str); 01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4) 01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string"); 01285 } 01286 01287 static inline int 01288 str_independent(VALUE str) 01289 { 01290 str_modifiable(str); 01291 if (!STR_SHARED_P(str)) return 1; 01292 if (STR_EMBED_P(str)) return 1; 01293 return 0; 01294 } 01295 01296 static void 01297 str_make_independent_expand(VALUE str, long expand) 01298 { 01299 char *ptr; 01300 long len = RSTRING_LEN(str); 01301 long capa = len + expand; 01302 01303 if (len > capa) len = capa; 01304 ptr = ALLOC_N(char, capa + 1); 01305 if (RSTRING_PTR(str)) { 01306 memcpy(ptr, RSTRING_PTR(str), len); 01307 } 01308 STR_SET_NOEMBED(str); 01309 STR_UNSET_NOCAPA(str); 01310 ptr[len] = 0; 01311 RSTRING(str)->as.heap.ptr = ptr; 01312 RSTRING(str)->as.heap.len = len; 01313 RSTRING(str)->as.heap.aux.capa = capa; 01314 } 01315 01316 #define str_make_independent(str) str_make_independent_expand((str), 0L) 01317 01318 void 01319 rb_str_modify(VALUE str) 01320 { 01321 if (!str_independent(str)) 01322 str_make_independent(str); 01323 ENC_CODERANGE_CLEAR(str); 01324 } 01325 01326 void 01327 rb_str_modify_expand(VALUE str, long expand) 01328 { 01329 if (expand < 0) { 01330 rb_raise(rb_eArgError, "negative expanding string size"); 01331 } 01332 if 
(!str_independent(str)) { 01333 str_make_independent_expand(str, expand); 01334 } 01335 else if (expand > 0) { 01336 long len = RSTRING_LEN(str); 01337 long capa = len + expand; 01338 if (!STR_EMBED_P(str)) { 01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1); 01340 STR_UNSET_NOCAPA(str); 01341 RSTRING(str)->as.heap.aux.capa = capa; 01342 } 01343 else if (capa > RSTRING_EMBED_LEN_MAX) { 01344 str_make_independent_expand(str, expand); 01345 } 01346 } 01347 ENC_CODERANGE_CLEAR(str); 01348 } 01349 01350 /* As rb_str_modify(), but don't clear coderange */ 01351 static void 01352 str_modify_keep_cr(VALUE str) 01353 { 01354 if (!str_independent(str)) 01355 str_make_independent(str); 01356 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN) 01357 /* Force re-scan later */ 01358 ENC_CODERANGE_CLEAR(str); 01359 } 01360 01361 static inline void 01362 str_discard(VALUE str) 01363 { 01364 str_modifiable(str); 01365 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) { 01366 xfree(RSTRING_PTR(str)); 01367 RSTRING(str)->as.heap.ptr = 0; 01368 RSTRING(str)->as.heap.len = 0; 01369 } 01370 } 01371 01372 void 01373 rb_str_associate(VALUE str, VALUE add) 01374 { 01375 /* sanity check */ 01376 rb_check_frozen(str); 01377 if (STR_ASSOC_P(str)) { 01378 /* already associated */ 01379 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add); 01380 } 01381 else { 01382 if (STR_SHARED_P(str)) { 01383 VALUE assoc = RSTRING(str)->as.heap.aux.shared; 01384 str_make_independent(str); 01385 if (STR_ASSOC_P(assoc)) { 01386 assoc = RSTRING(assoc)->as.heap.aux.shared; 01387 rb_ary_concat(assoc, add); 01388 add = assoc; 01389 } 01390 } 01391 else if (STR_EMBED_P(str)) { 01392 str_make_independent(str); 01393 } 01394 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) { 01395 RESIZE_CAPA(str, RSTRING_LEN(str)); 01396 } 01397 FL_SET(str, STR_ASSOC); 01398 RBASIC(add)->klass = 0; 01399 RSTRING(str)->as.heap.aux.shared = add; 01400 } 01401 } 01402 01403 VALUE 01404 rb_str_associated(VALUE str) 01405 { 
01406 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared; 01407 if (STR_ASSOC_P(str)) { 01408 return RSTRING(str)->as.heap.aux.shared; 01409 } 01410 return Qfalse; 01411 } 01412 01413 VALUE 01414 rb_string_value(volatile VALUE *ptr) 01415 { 01416 VALUE s = *ptr; 01417 if (TYPE(s) != T_STRING) { 01418 s = rb_str_to_str(s); 01419 *ptr = s; 01420 } 01421 return s; 01422 } 01423 01424 char * 01425 rb_string_value_ptr(volatile VALUE *ptr) 01426 { 01427 VALUE str = rb_string_value(ptr); 01428 return RSTRING_PTR(str); 01429 } 01430 01431 char * 01432 rb_string_value_cstr(volatile VALUE *ptr) 01433 { 01434 VALUE str = rb_string_value(ptr); 01435 char *s = RSTRING_PTR(str); 01436 long len = RSTRING_LEN(str); 01437 01438 if (!s || memchr(s, 0, len)) { 01439 rb_raise(rb_eArgError, "string contains null byte"); 01440 } 01441 if (s[len]) { 01442 rb_str_modify(str); 01443 s = RSTRING_PTR(str); 01444 s[RSTRING_LEN(str)] = 0; 01445 } 01446 return s; 01447 } 01448 01449 VALUE 01450 rb_check_string_type(VALUE str) 01451 { 01452 str = rb_check_convert_type(str, T_STRING, "String", "to_str"); 01453 return str; 01454 } 01455 01456 /* 01457 * call-seq: 01458 * String.try_convert(obj) -> string or nil 01459 * 01460 * Try to convert <i>obj</i> into a String, using to_str method. 01461 * Returns converted string or nil if <i>obj</i> cannot be converted 01462 * for any reason. 
01463 * 01464 * String.try_convert("str") #=> "str" 01465 * String.try_convert(/re/) #=> nil 01466 */ 01467 static VALUE 01468 rb_str_s_try_convert(VALUE dummy, VALUE str) 01469 { 01470 return rb_check_string_type(str); 01471 } 01472 01473 static char* 01474 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc) 01475 { 01476 long nth = *nthp; 01477 if (rb_enc_mbmaxlen(enc) == 1) { 01478 p += nth; 01479 } 01480 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01481 p += nth * rb_enc_mbmaxlen(enc); 01482 } 01483 else if (rb_enc_asciicompat(enc)) { 01484 const char *p2, *e2; 01485 int n; 01486 01487 while (p < e && 0 < nth) { 01488 e2 = p + nth; 01489 if (e < e2) { 01490 *nthp = nth; 01491 return (char *)e; 01492 } 01493 if (ISASCII(*p)) { 01494 p2 = search_nonascii(p, e2); 01495 if (!p2) { 01496 *nthp = nth; 01497 return (char *)e2; 01498 } 01499 nth -= p2 - p; 01500 p = p2; 01501 } 01502 n = rb_enc_mbclen(p, e, enc); 01503 p += n; 01504 nth--; 01505 } 01506 *nthp = nth; 01507 if (nth != 0) { 01508 return (char *)e; 01509 } 01510 return (char *)p; 01511 } 01512 else { 01513 while (p < e && nth--) { 01514 p += rb_enc_mbclen(p, e, enc); 01515 } 01516 } 01517 if (p > e) p = e; 01518 *nthp = nth; 01519 return (char*)p; 01520 } 01521 01522 char* 01523 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc) 01524 { 01525 return str_nth_len(p, e, &nth, enc); 01526 } 01527 01528 static char* 01529 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01530 { 01531 if (singlebyte) 01532 p += nth; 01533 else { 01534 p = str_nth_len(p, e, &nth, enc); 01535 } 01536 if (!p) return 0; 01537 if (p > e) p = e; 01538 return (char *)p; 01539 } 01540 01541 /* char offset to byte offset */ 01542 static long 01543 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte) 01544 { 01545 const char *pp = str_nth(p, e, nth, enc, singlebyte); 01546 if (!pp) return e - p; 01547 return pp - p; 01548 } 
01549 01550 long 01551 rb_str_offset(VALUE str, long pos) 01552 { 01553 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 01554 STR_ENC_GET(str), single_byte_optimizable(str)); 01555 } 01556 01557 #ifdef NONASCII_MASK 01558 static char * 01559 str_utf8_nth(const char *p, const char *e, long *nthp) 01560 { 01561 long nth = *nthp; 01562 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) { 01563 const VALUE *s, *t; 01564 const VALUE lowbits = sizeof(VALUE) - 1; 01565 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits)); 01566 t = (const VALUE*)(~lowbits & (VALUE)e); 01567 while (p < (const char *)s) { 01568 if (is_utf8_lead_byte(*p)) nth--; 01569 p++; 01570 } 01571 do { 01572 nth -= count_utf8_lead_bytes_with_word(s); 01573 s++; 01574 } while (s < t && (int)sizeof(VALUE) <= nth); 01575 p = (char *)s; 01576 } 01577 while (p < e) { 01578 if (is_utf8_lead_byte(*p)) { 01579 if (nth == 0) break; 01580 nth--; 01581 } 01582 p++; 01583 } 01584 *nthp = nth; 01585 return (char *)p; 01586 } 01587 01588 static long 01589 str_utf8_offset(const char *p, const char *e, long nth) 01590 { 01591 const char *pp = str_utf8_nth(p, e, &nth); 01592 return pp - p; 01593 } 01594 #endif 01595 01596 /* byte offset to char offset */ 01597 long 01598 rb_str_sublen(VALUE str, long pos) 01599 { 01600 if (single_byte_optimizable(str) || pos < 0) 01601 return pos; 01602 else { 01603 char *p = RSTRING_PTR(str); 01604 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); 01605 } 01606 } 01607 01608 VALUE 01609 rb_str_subseq(VALUE str, long beg, long len) 01610 { 01611 VALUE str2; 01612 01613 if (RSTRING_LEN(str) == beg + len && 01614 RSTRING_EMBED_LEN_MAX < len) { 01615 str2 = rb_str_new_shared(rb_str_new_frozen(str)); 01616 rb_str_drop_bytes(str2, beg); 01617 } 01618 else { 01619 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); 01620 } 01621 01622 rb_enc_cr_str_copy_for_substr(str2, str); 01623 OBJ_INFECT(str2, str); 01624 01625 return str2; 01626 } 01627 01628 
VALUE 01629 rb_str_substr(VALUE str, long beg, long len) 01630 { 01631 rb_encoding *enc = STR_ENC_GET(str); 01632 VALUE str2; 01633 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str); 01634 01635 if (len < 0) return Qnil; 01636 if (!RSTRING_LEN(str)) { 01637 len = 0; 01638 } 01639 if (single_byte_optimizable(str)) { 01640 if (beg > RSTRING_LEN(str)) return Qnil; 01641 if (beg < 0) { 01642 beg += RSTRING_LEN(str); 01643 if (beg < 0) return Qnil; 01644 } 01645 if (beg + len > RSTRING_LEN(str)) 01646 len = RSTRING_LEN(str) - beg; 01647 if (len <= 0) { 01648 len = 0; 01649 p = 0; 01650 } 01651 else 01652 p = s + beg; 01653 goto sub; 01654 } 01655 if (beg < 0) { 01656 if (len > -beg) len = -beg; 01657 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) { 01658 beg = -beg; 01659 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0); 01660 p = e; 01661 if (!p) return Qnil; 01662 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0); 01663 if (!p) return Qnil; 01664 len = e - p; 01665 goto sub; 01666 } 01667 else { 01668 beg += str_strlen(str, enc); 01669 if (beg < 0) return Qnil; 01670 } 01671 } 01672 else if (beg > 0 && beg > RSTRING_LEN(str)) { 01673 return Qnil; 01674 } 01675 if (len == 0) { 01676 if (beg > str_strlen(str, enc)) return Qnil; 01677 p = 0; 01678 } 01679 #ifdef NONASCII_MASK 01680 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && 01681 enc == rb_utf8_encoding()) { 01682 p = str_utf8_nth(s, e, &beg); 01683 if (beg > 0) return Qnil; 01684 len = str_utf8_offset(p, e, len); 01685 } 01686 #endif 01687 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) { 01688 int char_sz = rb_enc_mbmaxlen(enc); 01689 01690 p = s + beg * char_sz; 01691 if (p > e) { 01692 return Qnil; 01693 } 01694 else if (len * char_sz > e - p) 01695 len = e - p; 01696 else 01697 len *= char_sz; 01698 } 01699 else if ((p = str_nth_len(s, e, &beg, enc)) == e) { 01700 if (beg > 0) return Qnil; 01701 len = 0; 01702 } 01703 else { 01704 len = str_offset(p, e, 
len, enc, 0); 01705 } 01706 sub: 01707 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) { 01708 str2 = rb_str_new4(str); 01709 str2 = str_new3(rb_obj_class(str2), str2); 01710 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 01711 RSTRING(str2)->as.heap.len = len; 01712 } 01713 else { 01714 str2 = rb_str_new5(str, p, len); 01715 rb_enc_cr_str_copy_for_substr(str2, str); 01716 OBJ_INFECT(str2, str); 01717 } 01718 01719 return str2; 01720 } 01721 01722 VALUE 01723 rb_str_freeze(VALUE str) 01724 { 01725 if (STR_ASSOC_P(str)) { 01726 VALUE ary = RSTRING(str)->as.heap.aux.shared; 01727 OBJ_FREEZE(ary); 01728 } 01729 return rb_obj_freeze(str); 01730 } 01731 01732 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str)) 01733 #define rb_str_dup_frozen rb_str_new_frozen 01734 01735 VALUE 01736 rb_str_locktmp(VALUE str) 01737 { 01738 if (FL_TEST(str, STR_TMPLOCK)) { 01739 rb_raise(rb_eRuntimeError, "temporal locking already locked string"); 01740 } 01741 FL_SET(str, STR_TMPLOCK); 01742 return str; 01743 } 01744 01745 VALUE 01746 rb_str_unlocktmp(VALUE str) 01747 { 01748 if (!FL_TEST(str, STR_TMPLOCK)) { 01749 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string"); 01750 } 01751 FL_UNSET(str, STR_TMPLOCK); 01752 return str; 01753 } 01754 01755 VALUE 01756 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg) 01757 { 01758 rb_str_locktmp(str); 01759 return rb_ensure(func, arg, rb_str_unlocktmp, str); 01760 } 01761 01762 void 01763 rb_str_set_len(VALUE str, long len) 01764 { 01765 long capa; 01766 01767 str_modifiable(str); 01768 if (STR_SHARED_P(str)) { 01769 rb_raise(rb_eRuntimeError, "can't set length of shared string"); 01770 } 01771 if (len > (capa = (long)rb_str_capacity(str))) { 01772 rb_bug("probable buffer overflow: %ld for %ld", len, capa); 01773 } 01774 STR_SET_LEN(str, len); 01775 RSTRING_PTR(str)[len] = '\0'; 01776 } 01777 01778 VALUE 01779 rb_str_resize(VALUE str, long len) 01780 { 
01781 long slen; 01782 int independent; 01783 01784 if (len < 0) { 01785 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01786 } 01787 01788 independent = str_independent(str); 01789 ENC_CODERANGE_CLEAR(str); 01790 slen = RSTRING_LEN(str); 01791 if (len != slen) { 01792 if (STR_EMBED_P(str)) { 01793 if (len <= RSTRING_EMBED_LEN_MAX) { 01794 STR_SET_EMBED_LEN(str, len); 01795 RSTRING(str)->as.ary[len] = '\0'; 01796 return str; 01797 } 01798 str_make_independent_expand(str, len - slen); 01799 STR_SET_NOEMBED(str); 01800 } 01801 else if (len <= RSTRING_EMBED_LEN_MAX) { 01802 char *ptr = RSTRING(str)->as.heap.ptr; 01803 STR_SET_EMBED(str); 01804 if (slen > len) slen = len; 01805 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen); 01806 RSTRING(str)->as.ary[len] = '\0'; 01807 STR_SET_EMBED_LEN(str, len); 01808 if (independent) xfree(ptr); 01809 return str; 01810 } 01811 else if (!independent) { 01812 str_make_independent_expand(str, len - slen); 01813 } 01814 else if (slen < len || slen - len > 1024) { 01815 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1); 01816 } 01817 if (!STR_NOCAPA_P(str)) { 01818 RSTRING(str)->as.heap.aux.capa = len; 01819 } 01820 RSTRING(str)->as.heap.len = len; 01821 RSTRING(str)->as.heap.ptr[len] = '\0'; /* sentinel */ 01822 } 01823 return str; 01824 } 01825 01826 static VALUE 01827 str_buf_cat(VALUE str, const char *ptr, long len) 01828 { 01829 long capa, total, off = -1; 01830 01831 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) { 01832 off = ptr - RSTRING_PTR(str); 01833 } 01834 rb_str_modify(str); 01835 if (len == 0) return 0; 01836 if (STR_ASSOC_P(str)) { 01837 FL_UNSET(str, STR_ASSOC); 01838 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str); 01839 } 01840 else if (STR_EMBED_P(str)) { 01841 capa = RSTRING_EMBED_LEN_MAX; 01842 } 01843 else { 01844 capa = RSTRING(str)->as.heap.aux.capa; 01845 } 01846 if (RSTRING_LEN(str) >= LONG_MAX - len) { 01847 rb_raise(rb_eArgError, "string sizes too big"); 
01848 } 01849 total = RSTRING_LEN(str)+len; 01850 if (capa <= total) { 01851 while (total > capa) { 01852 if (capa + 1 >= LONG_MAX / 2) { 01853 capa = (total + 4095) / 4096; 01854 break; 01855 } 01856 capa = (capa + 1) * 2; 01857 } 01858 RESIZE_CAPA(str, capa); 01859 } 01860 if (off != -1) { 01861 ptr = RSTRING_PTR(str) + off; 01862 } 01863 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len); 01864 STR_SET_LEN(str, total); 01865 RSTRING_PTR(str)[total] = '\0'; /* sentinel */ 01866 01867 return str; 01868 } 01869 01870 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr)) 01871 01872 VALUE 01873 rb_str_buf_cat(VALUE str, const char *ptr, long len) 01874 { 01875 if (len == 0) return str; 01876 if (len < 0) { 01877 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01878 } 01879 return str_buf_cat(str, ptr, len); 01880 } 01881 01882 VALUE 01883 rb_str_buf_cat2(VALUE str, const char *ptr) 01884 { 01885 return rb_str_buf_cat(str, ptr, strlen(ptr)); 01886 } 01887 01888 VALUE 01889 rb_str_cat(VALUE str, const char *ptr, long len) 01890 { 01891 if (len < 0) { 01892 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01893 } 01894 if (STR_ASSOC_P(str)) { 01895 char *p; 01896 rb_str_modify_expand(str, len); 01897 p = RSTRING(str)->as.heap.ptr; 01898 memcpy(p + RSTRING(str)->as.heap.len, ptr, len); 01899 len = RSTRING(str)->as.heap.len += len; 01900 p[len] = '\0'; /* sentinel */ 01901 return str; 01902 } 01903 01904 return rb_str_buf_cat(str, ptr, len); 01905 } 01906 01907 VALUE 01908 rb_str_cat2(VALUE str, const char *ptr) 01909 { 01910 return rb_str_cat(str, ptr, strlen(ptr)); 01911 } 01912 01913 static VALUE 01914 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len, 01915 int ptr_encindex, int ptr_cr, int *ptr_cr_ret) 01916 { 01917 int str_encindex = ENCODING_GET(str); 01918 int res_encindex; 01919 int str_cr, res_cr; 01920 01921 str_cr = ENC_CODERANGE(str); 01922 01923 if (str_encindex == ptr_encindex) { 01924 if 
(str_cr == ENC_CODERANGE_UNKNOWN) 01925 ptr_cr = ENC_CODERANGE_UNKNOWN; 01926 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01927 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex)); 01928 } 01929 } 01930 else { 01931 rb_encoding *str_enc = rb_enc_from_index(str_encindex); 01932 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex); 01933 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) { 01934 if (len == 0) 01935 return str; 01936 if (RSTRING_LEN(str) == 0) { 01937 rb_str_buf_cat(str, ptr, len); 01938 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr); 01939 return str; 01940 } 01941 goto incompatible; 01942 } 01943 if (ptr_cr == ENC_CODERANGE_UNKNOWN) { 01944 ptr_cr = coderange_scan(ptr, len, ptr_enc); 01945 } 01946 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01947 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) { 01948 str_cr = rb_enc_str_coderange(str); 01949 } 01950 } 01951 } 01952 if (ptr_cr_ret) 01953 *ptr_cr_ret = ptr_cr; 01954 01955 if (str_encindex != ptr_encindex && 01956 str_cr != ENC_CODERANGE_7BIT && 01957 ptr_cr != ENC_CODERANGE_7BIT) { 01958 incompatible: 01959 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 01960 rb_enc_name(rb_enc_from_index(str_encindex)), 01961 rb_enc_name(rb_enc_from_index(ptr_encindex))); 01962 } 01963 01964 if (str_cr == ENC_CODERANGE_UNKNOWN) { 01965 res_encindex = str_encindex; 01966 res_cr = ENC_CODERANGE_UNKNOWN; 01967 } 01968 else if (str_cr == ENC_CODERANGE_7BIT) { 01969 if (ptr_cr == ENC_CODERANGE_7BIT) { 01970 res_encindex = str_encindex; 01971 res_cr = ENC_CODERANGE_7BIT; 01972 } 01973 else { 01974 res_encindex = ptr_encindex; 01975 res_cr = ptr_cr; 01976 } 01977 } 01978 else if (str_cr == ENC_CODERANGE_VALID) { 01979 res_encindex = str_encindex; 01980 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID) 01981 res_cr = str_cr; 01982 else 01983 res_cr = ptr_cr; 01984 } 01985 else { /* str_cr == ENC_CODERANGE_BROKEN */ 01986 res_encindex 
= str_encindex; 01987 res_cr = str_cr; 01988 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN; 01989 } 01990 01991 if (len < 0) { 01992 rb_raise(rb_eArgError, "negative string size (or size too big)"); 01993 } 01994 str_buf_cat(str, ptr, len); 01995 ENCODING_CODERANGE_SET(str, res_encindex, res_cr); 01996 return str; 01997 } 01998 01999 VALUE 02000 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc) 02001 { 02002 return rb_enc_cr_str_buf_cat(str, ptr, len, 02003 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL); 02004 } 02005 02006 VALUE 02007 rb_str_buf_cat_ascii(VALUE str, const char *ptr) 02008 { 02009 /* ptr must reference NUL terminated ASCII string. */ 02010 int encindex = ENCODING_GET(str); 02011 rb_encoding *enc = rb_enc_from_index(encindex); 02012 if (rb_enc_asciicompat(enc)) { 02013 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr), 02014 encindex, ENC_CODERANGE_7BIT, 0); 02015 } 02016 else { 02017 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc)); 02018 while (*ptr) { 02019 unsigned int c = (unsigned char)*ptr; 02020 int len = rb_enc_codelen(c, enc); 02021 rb_enc_mbcput(c, buf, enc); 02022 rb_enc_cr_str_buf_cat(str, buf, len, 02023 encindex, ENC_CODERANGE_VALID, 0); 02024 ptr++; 02025 } 02026 return str; 02027 } 02028 } 02029 02030 VALUE 02031 rb_str_buf_append(VALUE str, VALUE str2) 02032 { 02033 int str2_cr; 02034 02035 str2_cr = ENC_CODERANGE(str2); 02036 02037 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), 02038 ENCODING_GET(str2), str2_cr, &str2_cr); 02039 02040 OBJ_INFECT(str, str2); 02041 ENC_CODERANGE_SET(str2, str2_cr); 02042 02043 return str; 02044 } 02045 02046 VALUE 02047 rb_str_append(VALUE str, VALUE str2) 02048 { 02049 rb_encoding *enc; 02050 int cr, cr2; 02051 long len2; 02052 02053 StringValue(str2); 02054 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) { 02055 long len = RSTRING_LEN(str) + len2; 02056 enc = rb_enc_check(str, str2); 02057 cr = ENC_CODERANGE(str); 02058 if ((cr2 = 
ENC_CODERANGE(str2)) > cr) cr = cr2; 02059 rb_str_modify_expand(str, len2); 02060 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len, 02061 RSTRING_PTR(str2), len2+1); 02062 RSTRING(str)->as.heap.len = len; 02063 rb_enc_associate(str, enc); 02064 ENC_CODERANGE_SET(str, cr); 02065 OBJ_INFECT(str, str2); 02066 return str; 02067 } 02068 return rb_str_buf_append(str, str2); 02069 } 02070 02071 /* 02072 * call-seq: 02073 * str << integer -> str 02074 * str.concat(integer) -> str 02075 * str << obj -> str 02076 * str.concat(obj) -> str 02077 * 02078 * Append---Concatenates the given object to <i>str</i>. If the object is a 02079 * <code>Integer</code>, it is considered as a codepoint, and is converted 02080 * to a character before concatenation. 02081 * 02082 * a = "hello " 02083 * a << "world" #=> "hello world" 02084 * a.concat(33) #=> "hello world!" 02085 */ 02086 02087 VALUE 02088 rb_str_concat(VALUE str1, VALUE str2) 02089 { 02090 unsigned int code; 02091 rb_encoding *enc = STR_ENC_GET(str1); 02092 02093 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) { 02094 if (rb_num_to_uint(str2, &code) == 0) { 02095 } 02096 else if (FIXNUM_P(str2)) { 02097 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); 02098 } 02099 else { 02100 rb_raise(rb_eRangeError, "bignum out of char range"); 02101 } 02102 } 02103 else { 02104 return rb_str_append(str1, str2); 02105 } 02106 02107 if (enc == rb_usascii_encoding()) { 02108 /* US-ASCII automatically extended to ASCII-8BIT */ 02109 char buf[1]; 02110 buf[0] = (char)code; 02111 if (code > 0xFF) { 02112 rb_raise(rb_eRangeError, "%u out of char range", code); 02113 } 02114 rb_str_cat(str1, buf, 1); 02115 if (code > 127) { 02116 rb_enc_associate(str1, rb_ascii8bit_encoding()); 02117 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); 02118 } 02119 } 02120 else { 02121 long pos = RSTRING_LEN(str1); 02122 int cr = ENC_CODERANGE(str1); 02123 int len; 02124 char *buf; 02125 02126 switch (len = rb_enc_codelen(code, enc)) { 02127 
case ONIGERR_INVALID_CODE_POINT_VALUE: 02128 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02129 break; 02130 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: 02131 case 0: 02132 rb_raise(rb_eRangeError, "%u out of char range", code); 02133 break; 02134 } 02135 buf = ALLOCA_N(char, len + 1); 02136 rb_enc_mbcput(code, buf, enc); 02137 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { 02138 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); 02139 } 02140 rb_str_resize(str1, pos+len); 02141 strncpy(RSTRING_PTR(str1) + pos, buf, len); 02142 if (cr == ENC_CODERANGE_7BIT && code > 127) 02143 cr = ENC_CODERANGE_VALID; 02144 ENC_CODERANGE_SET(str1, cr); 02145 } 02146 return str1; 02147 } 02148 02149 /* 02150 * call-seq: 02151 * str.prepend(other_str) -> str 02152 * 02153 * Prepend---Prepend the given string to <i>str</i>. 02154 * 02155 * a = "world" 02156 * a.prepend("hello ") #=> "hello world" 02157 * a #=> "hello world" 02158 */ 02159 02160 static VALUE 02161 rb_str_prepend(VALUE str, VALUE str2) 02162 { 02163 StringValue(str2); 02164 StringValue(str); 02165 rb_str_update(str, 0L, 0L, str2); 02166 return str; 02167 } 02168 02169 st_index_t 02170 rb_str_hash(VALUE str) 02171 { 02172 int e = ENCODING_GET(str); 02173 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { 02174 e = 0; 02175 } 02176 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e; 02177 } 02178 02179 int 02180 rb_str_hash_cmp(VALUE str1, VALUE str2) 02181 { 02182 long len; 02183 02184 if (!rb_str_comparable(str1, str2)) return 1; 02185 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) && 02186 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) { 02187 return 0; 02188 } 02189 return 1; 02190 } 02191 02192 /* 02193 * call-seq: 02194 * str.hash -> fixnum 02195 * 02196 * Return a hash based on the string's length and content. 
02197 */ 02198 02199 static VALUE 02200 rb_str_hash_m(VALUE str) 02201 { 02202 st_index_t hval = rb_str_hash(str); 02203 return INT2FIX(hval); 02204 } 02205 02206 #define lesser(a,b) (((a)>(b))?(b):(a)) 02207 02208 int 02209 rb_str_comparable(VALUE str1, VALUE str2) 02210 { 02211 int idx1, idx2; 02212 int rc1, rc2; 02213 02214 if (RSTRING_LEN(str1) == 0) return TRUE; 02215 if (RSTRING_LEN(str2) == 0) return TRUE; 02216 idx1 = ENCODING_GET(str1); 02217 idx2 = ENCODING_GET(str2); 02218 if (idx1 == idx2) return TRUE; 02219 rc1 = rb_enc_str_coderange(str1); 02220 rc2 = rb_enc_str_coderange(str2); 02221 if (rc1 == ENC_CODERANGE_7BIT) { 02222 if (rc2 == ENC_CODERANGE_7BIT) return TRUE; 02223 if (rb_enc_asciicompat(rb_enc_from_index(idx2))) 02224 return TRUE; 02225 } 02226 if (rc2 == ENC_CODERANGE_7BIT) { 02227 if (rb_enc_asciicompat(rb_enc_from_index(idx1))) 02228 return TRUE; 02229 } 02230 return FALSE; 02231 } 02232 02233 int 02234 rb_str_cmp(VALUE str1, VALUE str2) 02235 { 02236 long len1, len2; 02237 const char *ptr1, *ptr2; 02238 int retval; 02239 02240 if (str1 == str2) return 0; 02241 RSTRING_GETMEM(str1, ptr1, len1); 02242 RSTRING_GETMEM(str2, ptr2, len2); 02243 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) { 02244 if (len1 == len2) { 02245 if (!rb_str_comparable(str1, str2)) { 02246 if (ENCODING_GET(str1) > ENCODING_GET(str2)) 02247 return 1; 02248 return -1; 02249 } 02250 return 0; 02251 } 02252 if (len1 > len2) return 1; 02253 return -1; 02254 } 02255 if (retval > 0) return 1; 02256 return -1; 02257 } 02258 02259 /* expect tail call optimization */ 02260 static VALUE 02261 str_eql(const VALUE str1, const VALUE str2) 02262 { 02263 const long len = RSTRING_LEN(str1); 02264 const char *ptr1, *ptr2; 02265 02266 if (len != RSTRING_LEN(str2)) return Qfalse; 02267 if (!rb_str_comparable(str1, str2)) return Qfalse; 02268 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2))) 02269 return Qtrue; 02270 if (memcmp(ptr1, ptr2, len) == 
0) 02271 return Qtrue; 02272 return Qfalse; 02273 } 02274 /* 02275 * call-seq: 02276 * str == obj -> true or false 02277 * 02278 * Equality---If <i>obj</i> is not a <code>String</code>, returns 02279 * <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i> 02280 * <code><=></code> <i>obj</i> returns zero. 02281 */ 02282 02283 VALUE 02284 rb_str_equal(VALUE str1, VALUE str2) 02285 { 02286 if (str1 == str2) return Qtrue; 02287 if (TYPE(str2) != T_STRING) { 02288 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02289 return Qfalse; 02290 } 02291 return rb_equal(str2, str1); 02292 } 02293 return str_eql(str1, str2); 02294 } 02295 02296 /* 02297 * call-seq: 02298 * str.eql?(other) -> true or false 02299 * 02300 * Two strings are equal if they have the same length and content. 02301 */ 02302 02303 static VALUE 02304 rb_str_eql(VALUE str1, VALUE str2) 02305 { 02306 if (str1 == str2) return Qtrue; 02307 if (TYPE(str2) != T_STRING) return Qfalse; 02308 return str_eql(str1, str2); 02309 } 02310 02311 /* 02312 * call-seq: 02313 * str <=> other_str -> -1, 0, +1 or nil 02314 * 02315 * Comparison---Returns -1 if <i>other_str</i> is greater than, 0 if 02316 * <i>other_str</i> is equal to, and +1 if <i>other_str</i> is less than 02317 * <i>str</i>. If the strings are of different lengths, and the strings are 02318 * equal when compared up to the shortest length, then the longer string is 02319 * considered greater than the shorter one. In older versions of Ruby, setting 02320 * <code>$=</code> allowed case-insensitive comparisons; this is now deprecated 02321 * in favor of using <code>String#casecmp</code>. 02322 * 02323 * <code><=></code> is the basis for the methods <code><</code>, 02324 * <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>, 02325 * included from module <code>Comparable</code>. The method 02326 * <code>String#==</code> does not use <code>Comparable#==</code>. 
02327 * 02328 * "abcdef" <=> "abcde" #=> 1 02329 * "abcdef" <=> "abcdef" #=> 0 02330 * "abcdef" <=> "abcdefg" #=> -1 02331 * "abcdef" <=> "ABCDEF" #=> 1 02332 */ 02333 02334 static VALUE 02335 rb_str_cmp_m(VALUE str1, VALUE str2) 02336 { 02337 long result; 02338 02339 if (TYPE(str2) != T_STRING) { 02340 if (!rb_respond_to(str2, rb_intern("to_str"))) { 02341 return Qnil; 02342 } 02343 else if (!rb_respond_to(str2, rb_intern("<=>"))) { 02344 return Qnil; 02345 } 02346 else { 02347 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1); 02348 02349 if (NIL_P(tmp)) return Qnil; 02350 if (!FIXNUM_P(tmp)) { 02351 return rb_funcall(LONG2FIX(0), '-', 1, tmp); 02352 } 02353 result = -FIX2LONG(tmp); 02354 } 02355 } 02356 else { 02357 result = rb_str_cmp(str1, str2); 02358 } 02359 return LONG2NUM(result); 02360 } 02361 02362 /* 02363 * call-seq: 02364 * str.casecmp(other_str) -> -1, 0, +1 or nil 02365 * 02366 * Case-insensitive version of <code>String#<=></code>. 02367 * 02368 * "abcdef".casecmp("abcde") #=> 1 02369 * "aBcDeF".casecmp("abcdef") #=> 0 02370 * "abcdef".casecmp("abcdefg") #=> -1 02371 * "abcdef".casecmp("ABCDEF") #=> 0 02372 */ 02373 02374 static VALUE 02375 rb_str_casecmp(VALUE str1, VALUE str2) 02376 { 02377 long len; 02378 rb_encoding *enc; 02379 char *p1, *p1end, *p2, *p2end; 02380 02381 StringValue(str2); 02382 enc = rb_enc_compatible(str1, str2); 02383 if (!enc) { 02384 return Qnil; 02385 } 02386 02387 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1); 02388 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2); 02389 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) { 02390 while (p1 < p1end && p2 < p2end) { 02391 if (*p1 != *p2) { 02392 unsigned int c1 = TOUPPER(*p1 & 0xff); 02393 unsigned int c2 = TOUPPER(*p2 & 0xff); 02394 if (c1 != c2) 02395 return INT2FIX(c1 < c2 ? 
-1 : 1); 02396 } 02397 p1++; 02398 p2++; 02399 } 02400 } 02401 else { 02402 while (p1 < p1end && p2 < p2end) { 02403 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc); 02404 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc); 02405 02406 if (0 <= c1 && 0 <= c2) { 02407 c1 = TOUPPER(c1); 02408 c2 = TOUPPER(c2); 02409 if (c1 != c2) 02410 return INT2FIX(c1 < c2 ? -1 : 1); 02411 } 02412 else { 02413 int r; 02414 l1 = rb_enc_mbclen(p1, p1end, enc); 02415 l2 = rb_enc_mbclen(p2, p2end, enc); 02416 len = l1 < l2 ? l1 : l2; 02417 r = memcmp(p1, p2, len); 02418 if (r != 0) 02419 return INT2FIX(r < 0 ? -1 : 1); 02420 if (l1 != l2) 02421 return INT2FIX(l1 < l2 ? -1 : 1); 02422 } 02423 p1 += l1; 02424 p2 += l2; 02425 } 02426 } 02427 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0); 02428 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1); 02429 return INT2FIX(-1); 02430 } 02431 02432 static long 02433 rb_str_index(VALUE str, VALUE sub, long offset) 02434 { 02435 long pos; 02436 char *s, *sptr, *e; 02437 long len, slen; 02438 rb_encoding *enc; 02439 02440 enc = rb_enc_check(str, sub); 02441 if (is_broken_string(sub)) { 02442 return -1; 02443 } 02444 len = str_strlen(str, enc); 02445 slen = str_strlen(sub, enc); 02446 if (offset < 0) { 02447 offset += len; 02448 if (offset < 0) return -1; 02449 } 02450 if (len - offset < slen) return -1; 02451 s = RSTRING_PTR(str); 02452 e = s + RSTRING_LEN(str); 02453 if (offset) { 02454 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str)); 02455 s += offset; 02456 } 02457 if (slen == 0) return offset; 02458 /* need proceed one character at a time */ 02459 sptr = RSTRING_PTR(sub); 02460 slen = RSTRING_LEN(sub); 02461 len = RSTRING_LEN(str) - offset; 02462 for (;;) { 02463 char *t; 02464 pos = rb_memsearch(sptr, slen, s, len, enc); 02465 if (pos < 0) return pos; 02466 t = rb_enc_right_char_head(s, s+pos, e, enc); 02467 if (t == s + pos) break; 02468 if ((len -= t - s) <= 0) return -1; 02469 offset 
+= t - s; 02470 s = t; 02471 } 02472 return pos + offset; 02473 } 02474 02475 02476 /* 02477 * call-seq: 02478 * str.index(substring [, offset]) -> fixnum or nil 02479 * str.index(regexp [, offset]) -> fixnum or nil 02480 * 02481 * Returns the index of the first occurrence of the given <i>substring</i> or 02482 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02483 * found. If the second parameter is present, it specifies the position in the 02484 * string to begin the search. 02485 * 02486 * "hello".index('e') #=> 1 02487 * "hello".index('lo') #=> 3 02488 * "hello".index('a') #=> nil 02489 * "hello".index(?e) #=> 1 02490 * "hello".index(/[aeiou]/, -3) #=> 4 02491 */ 02492 02493 static VALUE 02494 rb_str_index_m(int argc, VALUE *argv, VALUE str) 02495 { 02496 VALUE sub; 02497 VALUE initpos; 02498 long pos; 02499 02500 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) { 02501 pos = NUM2LONG(initpos); 02502 } 02503 else { 02504 pos = 0; 02505 } 02506 if (pos < 0) { 02507 pos += str_strlen(str, STR_ENC_GET(str)); 02508 if (pos < 0) { 02509 if (TYPE(sub) == T_REGEXP) { 02510 rb_backref_set(Qnil); 02511 } 02512 return Qnil; 02513 } 02514 } 02515 02516 switch (TYPE(sub)) { 02517 case T_REGEXP: 02518 if (pos > str_strlen(str, STR_ENC_GET(str))) 02519 return Qnil; 02520 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02521 rb_enc_check(str, sub), single_byte_optimizable(str)); 02522 02523 pos = rb_reg_search(sub, str, pos, 0); 02524 pos = rb_str_sublen(str, pos); 02525 break; 02526 02527 default: { 02528 VALUE tmp; 02529 02530 tmp = rb_check_string_type(sub); 02531 if (NIL_P(tmp)) { 02532 rb_raise(rb_eTypeError, "type mismatch: %s given", 02533 rb_obj_classname(sub)); 02534 } 02535 sub = tmp; 02536 } 02537 /* fall through */ 02538 case T_STRING: 02539 pos = rb_str_index(str, sub, pos); 02540 pos = rb_str_sublen(str, pos); 02541 break; 02542 } 02543 02544 if (pos == -1) return Qnil; 02545 return LONG2NUM(pos); 02546 } 02547 02548 
static long 02549 rb_str_rindex(VALUE str, VALUE sub, long pos) 02550 { 02551 long len, slen; 02552 char *s, *sbeg, *e, *t; 02553 rb_encoding *enc; 02554 int singlebyte = single_byte_optimizable(str); 02555 02556 enc = rb_enc_check(str, sub); 02557 if (is_broken_string(sub)) { 02558 return -1; 02559 } 02560 len = str_strlen(str, enc); 02561 slen = str_strlen(sub, enc); 02562 /* substring longer than string */ 02563 if (len < slen) return -1; 02564 if (len - pos < slen) { 02565 pos = len - slen; 02566 } 02567 if (len == 0) { 02568 return pos; 02569 } 02570 sbeg = RSTRING_PTR(str); 02571 e = RSTRING_END(str); 02572 t = RSTRING_PTR(sub); 02573 slen = RSTRING_LEN(sub); 02574 s = str_nth(sbeg, e, pos, enc, singlebyte); 02575 while (s) { 02576 if (memcmp(s, t, slen) == 0) { 02577 return pos; 02578 } 02579 if (pos == 0) break; 02580 pos--; 02581 s = rb_enc_prev_char(sbeg, s, e, enc); 02582 } 02583 return -1; 02584 } 02585 02586 02587 /* 02588 * call-seq: 02589 * str.rindex(substring [, fixnum]) -> fixnum or nil 02590 * str.rindex(regexp [, fixnum]) -> fixnum or nil 02591 * 02592 * Returns the index of the last occurrence of the given <i>substring</i> or 02593 * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not 02594 * found. If the second parameter is present, it specifies the position in the 02595 * string to end the search---characters beyond this point will not be 02596 * considered. 
02597 * 02598 * "hello".rindex('e') #=> 1 02599 * "hello".rindex('l') #=> 3 02600 * "hello".rindex('a') #=> nil 02601 * "hello".rindex(?e) #=> 1 02602 * "hello".rindex(/[aeiou]/, -2) #=> 1 02603 */ 02604 02605 static VALUE 02606 rb_str_rindex_m(int argc, VALUE *argv, VALUE str) 02607 { 02608 VALUE sub; 02609 VALUE vpos; 02610 rb_encoding *enc = STR_ENC_GET(str); 02611 long pos, len = str_strlen(str, enc); 02612 02613 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) { 02614 pos = NUM2LONG(vpos); 02615 if (pos < 0) { 02616 pos += len; 02617 if (pos < 0) { 02618 if (TYPE(sub) == T_REGEXP) { 02619 rb_backref_set(Qnil); 02620 } 02621 return Qnil; 02622 } 02623 } 02624 if (pos > len) pos = len; 02625 } 02626 else { 02627 pos = len; 02628 } 02629 02630 switch (TYPE(sub)) { 02631 case T_REGEXP: 02632 /* enc = rb_get_check(str, sub); */ 02633 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos, 02634 STR_ENC_GET(str), single_byte_optimizable(str)); 02635 02636 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) { 02637 pos = rb_reg_search(sub, str, pos, 1); 02638 pos = rb_str_sublen(str, pos); 02639 } 02640 if (pos >= 0) return LONG2NUM(pos); 02641 break; 02642 02643 default: { 02644 VALUE tmp; 02645 02646 tmp = rb_check_string_type(sub); 02647 if (NIL_P(tmp)) { 02648 rb_raise(rb_eTypeError, "type mismatch: %s given", 02649 rb_obj_classname(sub)); 02650 } 02651 sub = tmp; 02652 } 02653 /* fall through */ 02654 case T_STRING: 02655 pos = rb_str_rindex(str, sub, pos); 02656 if (pos >= 0) return LONG2NUM(pos); 02657 break; 02658 } 02659 return Qnil; 02660 } 02661 02662 /* 02663 * call-seq: 02664 * str =~ obj -> fixnum or nil 02665 * 02666 * Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match 02667 * against <i>str</i>,and returns the position the match starts, or 02668 * <code>nil</code> if there is no match. Otherwise, invokes 02669 * <i>obj.=~</i>, passing <i>str</i> as an argument. 
The default 02670 * <code>=~</code> in <code>Object</code> returns <code>nil</code>. 02671 * 02672 * "cat o' 9 tails" =~ /\d/ #=> 7 02673 * "cat o' 9 tails" =~ 9 #=> nil 02674 */ 02675 02676 static VALUE 02677 rb_str_match(VALUE x, VALUE y) 02678 { 02679 switch (TYPE(y)) { 02680 case T_STRING: 02681 rb_raise(rb_eTypeError, "type mismatch: String given"); 02682 02683 case T_REGEXP: 02684 return rb_reg_match(y, x); 02685 02686 default: 02687 return rb_funcall(y, rb_intern("=~"), 1, x); 02688 } 02689 } 02690 02691 02692 static VALUE get_pat(VALUE, int); 02693 02694 02695 /* 02696 * call-seq: 02697 * str.match(pattern) -> matchdata or nil 02698 * str.match(pattern, pos) -> matchdata or nil 02699 * 02700 * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one), 02701 * then invokes its <code>match</code> method on <i>str</i>. If the second 02702 * parameter is present, it specifies the position in the string to begin the 02703 * search. 02704 * 02705 * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l"> 02706 * 'hello'.match('(.)\1')[0] #=> "ll" 02707 * 'hello'.match(/(.)\1/)[0] #=> "ll" 02708 * 'hello'.match('xx') #=> nil 02709 * 02710 * If a block is given, invoke the block with MatchData if match succeed, so 02711 * that you can write 02712 * 02713 * str.match(pat) {|m| ...} 02714 * 02715 * instead of 02716 * 02717 * if m = str.match(pat) 02718 * ... 02719 * end 02720 * 02721 * The return value is a value from block execution in this case. 
02722 */ 02723 02724 static VALUE 02725 rb_str_match_m(int argc, VALUE *argv, VALUE str) 02726 { 02727 VALUE re, result; 02728 if (argc < 1) 02729 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 02730 re = argv[0]; 02731 argv[0] = str; 02732 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); 02733 if (!NIL_P(result) && rb_block_given_p()) { 02734 return rb_yield(result); 02735 } 02736 return result; 02737 } 02738 02739 enum neighbor_char { 02740 NEIGHBOR_NOT_CHAR, 02741 NEIGHBOR_FOUND, 02742 NEIGHBOR_WRAPPED 02743 }; 02744 02745 static enum neighbor_char 02746 enc_succ_char(char *p, long len, rb_encoding *enc) 02747 { 02748 long i; 02749 int l; 02750 while (1) { 02751 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--) 02752 p[i] = '\0'; 02753 if (i < 0) 02754 return NEIGHBOR_WRAPPED; 02755 ++((unsigned char*)p)[i]; 02756 l = rb_enc_precise_mbclen(p, p+len, enc); 02757 if (MBCLEN_CHARFOUND_P(l)) { 02758 l = MBCLEN_CHARFOUND_LEN(l); 02759 if (l == len) { 02760 return NEIGHBOR_FOUND; 02761 } 02762 else { 02763 memset(p+l, 0xff, len-l); 02764 } 02765 } 02766 if (MBCLEN_INVALID_P(l) && i < len-1) { 02767 long len2; 02768 int l2; 02769 for (len2 = len-1; 0 < len2; len2--) { 02770 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02771 if (!MBCLEN_INVALID_P(l2)) 02772 break; 02773 } 02774 memset(p+len2+1, 0xff, len-(len2+1)); 02775 } 02776 } 02777 } 02778 02779 static enum neighbor_char 02780 enc_pred_char(char *p, long len, rb_encoding *enc) 02781 { 02782 long i; 02783 int l; 02784 while (1) { 02785 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--) 02786 p[i] = '\xff'; 02787 if (i < 0) 02788 return NEIGHBOR_WRAPPED; 02789 --((unsigned char*)p)[i]; 02790 l = rb_enc_precise_mbclen(p, p+len, enc); 02791 if (MBCLEN_CHARFOUND_P(l)) { 02792 l = MBCLEN_CHARFOUND_LEN(l); 02793 if (l == len) { 02794 return NEIGHBOR_FOUND; 02795 } 02796 else { 02797 memset(p+l, 0, len-l); 02798 } 02799 } 02800 if (MBCLEN_INVALID_P(l) && i < 
len-1) { 02801 long len2; 02802 int l2; 02803 for (len2 = len-1; 0 < len2; len2--) { 02804 l2 = rb_enc_precise_mbclen(p, p+len2, enc); 02805 if (!MBCLEN_INVALID_P(l2)) 02806 break; 02807 } 02808 memset(p+len2+1, 0, len-(len2+1)); 02809 } 02810 } 02811 } 02812 02813 /* 02814 overwrite +p+ by succeeding letter in +enc+ and returns 02815 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED. 02816 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry. 02817 assuming each ranges are successive, and mbclen 02818 never change in each ranges. 02819 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one 02820 character. 02821 */ 02822 static enum neighbor_char 02823 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry) 02824 { 02825 enum neighbor_char ret; 02826 unsigned int c; 02827 int ctype; 02828 int range; 02829 char save[ONIGENC_CODE_TO_MBC_MAXLEN]; 02830 02831 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02832 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc)) 02833 ctype = ONIGENC_CTYPE_DIGIT; 02834 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc)) 02835 ctype = ONIGENC_CTYPE_ALPHA; 02836 else 02837 return NEIGHBOR_NOT_CHAR; 02838 02839 MEMCPY(save, p, char, len); 02840 ret = enc_succ_char(p, len, enc); 02841 if (ret == NEIGHBOR_FOUND) { 02842 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02843 if (rb_enc_isctype(c, ctype, enc)) 02844 return NEIGHBOR_FOUND; 02845 } 02846 MEMCPY(p, save, char, len); 02847 range = 1; 02848 while (1) { 02849 MEMCPY(save, p, char, len); 02850 ret = enc_pred_char(p, len, enc); 02851 if (ret == NEIGHBOR_FOUND) { 02852 c = rb_enc_mbc_to_codepoint(p, p+len, enc); 02853 if (!rb_enc_isctype(c, ctype, enc)) { 02854 MEMCPY(p, save, char, len); 02855 break; 02856 } 02857 } 02858 else { 02859 MEMCPY(p, save, char, len); 02860 break; 02861 } 02862 range++; 02863 } 02864 if (range == 1) { 02865 return NEIGHBOR_NOT_CHAR; 02866 } 02867 02868 if (ctype != ONIGENC_CTYPE_DIGIT) { 02869 MEMCPY(carry, p, char, len); 02870 
return NEIGHBOR_WRAPPED; 02871 } 02872 02873 MEMCPY(carry, p, char, len); 02874 enc_succ_char(carry, len, enc); 02875 return NEIGHBOR_WRAPPED; 02876 } 02877 02878 02879 /* 02880 * call-seq: 02881 * str.succ -> new_str 02882 * str.next -> new_str 02883 * 02884 * Returns the successor to <i>str</i>. The successor is calculated by 02885 * incrementing characters starting from the rightmost alphanumeric (or 02886 * the rightmost character if there are no alphanumerics) in the 02887 * string. Incrementing a digit always results in another digit, and 02888 * incrementing a letter results in another letter of the same case. 02889 * Incrementing nonalphanumerics uses the underlying character set's 02890 * collating sequence. 02891 * 02892 * If the increment generates a ``carry,'' the character to the left of 02893 * it is incremented. This process repeats until there is no carry, 02894 * adding an additional character if necessary. 02895 * 02896 * "abcd".succ #=> "abce" 02897 * "THX1138".succ #=> "THX1139" 02898 * "<<koala>>".succ #=> "<<koalb>>" 02899 * "1999zzz".succ #=> "2000aaa" 02900 * "ZZZ9999".succ #=> "AAAA0000" 02901 * "***".succ #=> "**+" 02902 */ 02903 02904 VALUE 02905 rb_str_succ(VALUE orig) 02906 { 02907 rb_encoding *enc; 02908 VALUE str; 02909 char *sbeg, *s, *e, *last_alnum = 0; 02910 int c = -1; 02911 long l; 02912 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1"; 02913 long carry_pos = 0, carry_len = 1; 02914 enum neighbor_char neighbor = NEIGHBOR_FOUND; 02915 02916 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig)); 02917 rb_enc_cr_str_copy_for_substr(str, orig); 02918 OBJ_INFECT(str, orig); 02919 if (RSTRING_LEN(str) == 0) return str; 02920 02921 enc = STR_ENC_GET(orig); 02922 sbeg = RSTRING_PTR(str); 02923 s = e = sbeg + RSTRING_LEN(str); 02924 02925 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02926 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) { 02927 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) : 02928 ISDIGIT(*last_alnum) ? 
ISALPHA(*s) : 0) { 02929 s = last_alnum; 02930 break; 02931 } 02932 } 02933 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02934 neighbor = enc_succ_alnum_char(s, l, enc, carry); 02935 switch (neighbor) { 02936 case NEIGHBOR_NOT_CHAR: 02937 continue; 02938 case NEIGHBOR_FOUND: 02939 return str; 02940 case NEIGHBOR_WRAPPED: 02941 last_alnum = s; 02942 break; 02943 } 02944 c = 1; 02945 carry_pos = s - sbeg; 02946 carry_len = l; 02947 } 02948 if (c == -1) { /* str contains no alnum */ 02949 s = e; 02950 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) { 02951 enum neighbor_char neighbor; 02952 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue; 02953 neighbor = enc_succ_char(s, l, enc); 02954 if (neighbor == NEIGHBOR_FOUND) 02955 return str; 02956 if (rb_enc_precise_mbclen(s, s+l, enc) != l) { 02957 /* wrapped to \0...\0. search next valid char. */ 02958 enc_succ_char(s, l, enc); 02959 } 02960 if (!rb_enc_asciicompat(enc)) { 02961 MEMCPY(carry, s, char, l); 02962 carry_len = l; 02963 } 02964 carry_pos = s - sbeg; 02965 } 02966 } 02967 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len); 02968 s = RSTRING_PTR(str) + carry_pos; 02969 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos); 02970 memmove(s, carry, carry_len); 02971 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len); 02972 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 02973 rb_enc_str_coderange(str); 02974 return str; 02975 } 02976 02977 02978 /* 02979 * call-seq: 02980 * str.succ! -> str 02981 * str.next! -> str 02982 * 02983 * Equivalent to <code>String#succ</code>, but modifies the receiver in 02984 * place. 
02985 */ 02986 02987 static VALUE 02988 rb_str_succ_bang(VALUE str) 02989 { 02990 rb_str_shared_replace(str, rb_str_succ(str)); 02991 02992 return str; 02993 } 02994 02995 02996 /* 02997 * call-seq: 02998 * str.upto(other_str, exclusive=false) {|s| block } -> str 02999 * str.upto(other_str, exclusive=false) -> an_enumerator 03000 * 03001 * Iterates through successive values, starting at <i>str</i> and 03002 * ending at <i>other_str</i> inclusive, passing each value in turn to 03003 * the block. The <code>String#succ</code> method is used to generate 03004 * each value. If optional second argument exclusive is omitted or is false, 03005 * the last value will be included; otherwise it will be excluded. 03006 * 03007 * If no block is given, an enumerator is returned instead. 03008 * 03009 * "a8".upto("b6") {|s| print s, ' ' } 03010 * for s in "a8".."b6" 03011 * print s, ' ' 03012 * end 03013 * 03014 * <em>produces:</em> 03015 * 03016 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03017 * a8 a9 b0 b1 b2 b3 b4 b5 b6 03018 * 03019 * If <i>str</i> and <i>other_str</i> contains only ascii numeric characters, 03020 * both are recognized as decimal numbers. In addition, the width of 03021 * string (e.g. leading zeros) is handled appropriately. 
03022 * 03023 * "9".upto("11").to_a #=> ["9", "10", "11"] 03024 * "25".upto("5").to_a #=> [] 03025 * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"] 03026 */ 03027 03028 static VALUE 03029 rb_str_upto(int argc, VALUE *argv, VALUE beg) 03030 { 03031 VALUE end, exclusive; 03032 VALUE current, after_end; 03033 ID succ; 03034 int n, excl, ascii; 03035 rb_encoding *enc; 03036 03037 rb_scan_args(argc, argv, "11", &end, &exclusive); 03038 RETURN_ENUMERATOR(beg, argc, argv); 03039 excl = RTEST(exclusive); 03040 CONST_ID(succ, "succ"); 03041 StringValue(end); 03042 enc = rb_enc_check(beg, end); 03043 ascii = (is_ascii_string(beg) && is_ascii_string(end)); 03044 /* single character */ 03045 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) { 03046 char c = RSTRING_PTR(beg)[0]; 03047 char e = RSTRING_PTR(end)[0]; 03048 03049 if (c > e || (excl && c == e)) return beg; 03050 for (;;) { 03051 rb_yield(rb_enc_str_new(&c, 1, enc)); 03052 if (!excl && c == e) break; 03053 c++; 03054 if (excl && c == e) break; 03055 } 03056 return beg; 03057 } 03058 /* both edges are all digits */ 03059 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) { 03060 char *s, *send; 03061 VALUE b, e; 03062 int width; 03063 03064 s = RSTRING_PTR(beg); send = RSTRING_END(beg); 03065 width = rb_long2int(send - s); 03066 while (s < send) { 03067 if (!ISDIGIT(*s)) goto no_digits; 03068 s++; 03069 } 03070 s = RSTRING_PTR(end); send = RSTRING_END(end); 03071 while (s < send) { 03072 if (!ISDIGIT(*s)) goto no_digits; 03073 s++; 03074 } 03075 b = rb_str_to_inum(beg, 10, FALSE); 03076 e = rb_str_to_inum(end, 10, FALSE); 03077 if (FIXNUM_P(b) && FIXNUM_P(e)) { 03078 long bi = FIX2LONG(b); 03079 long ei = FIX2LONG(e); 03080 rb_encoding *usascii = rb_usascii_encoding(); 03081 03082 while (bi <= ei) { 03083 if (excl && bi == ei) break; 03084 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi)); 03085 bi++; 03086 } 03087 } 03088 else { 03089 ID op = excl ? 
'<' : rb_intern("<="); 03090 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d")); 03091 03092 args[0] = INT2FIX(width); 03093 while (rb_funcall(b, op, 1, e)) { 03094 args[1] = b; 03095 rb_yield(rb_str_format(numberof(args), args, fmt)); 03096 b = rb_funcall(b, succ, 0, 0); 03097 } 03098 } 03099 return beg; 03100 } 03101 /* normal case */ 03102 no_digits: 03103 n = rb_str_cmp(beg, end); 03104 if (n > 0 || (excl && n == 0)) return beg; 03105 03106 after_end = rb_funcall(end, succ, 0, 0); 03107 current = rb_str_dup(beg); 03108 while (!rb_str_equal(current, after_end)) { 03109 VALUE next = Qnil; 03110 if (excl || !rb_str_equal(current, end)) 03111 next = rb_funcall(current, succ, 0, 0); 03112 rb_yield(current); 03113 if (NIL_P(next)) break; 03114 current = next; 03115 StringValue(current); 03116 if (excl && rb_str_equal(current, end)) break; 03117 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0) 03118 break; 03119 } 03120 03121 return beg; 03122 } 03123 03124 static VALUE 03125 rb_str_subpat(VALUE str, VALUE re, VALUE backref) 03126 { 03127 if (rb_reg_search(re, str, 0, 0) >= 0) { 03128 VALUE match = rb_backref_get(); 03129 int nth = rb_reg_backref_number(match, backref); 03130 return rb_reg_nth_match(nth, match); 03131 } 03132 return Qnil; 03133 } 03134 03135 static VALUE 03136 rb_str_aref(VALUE str, VALUE indx) 03137 { 03138 long idx; 03139 03140 switch (TYPE(indx)) { 03141 case T_FIXNUM: 03142 idx = FIX2LONG(indx); 03143 03144 num_index: 03145 str = rb_str_substr(str, idx, 1); 03146 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; 03147 return str; 03148 03149 case T_REGEXP: 03150 return rb_str_subpat(str, indx, INT2FIX(0)); 03151 03152 case T_STRING: 03153 if (rb_str_index(str, indx, 0) != -1) 03154 return rb_str_dup(indx); 03155 return Qnil; 03156 03157 default: 03158 /* check if indx is Range */ 03159 { 03160 long beg, len; 03161 VALUE tmp; 03162 03163 len = str_strlen(str, STR_ENC_GET(str)); 03164 switch 
(rb_range_beg_len(indx, &beg, &len, len, 0)) { 03165 case Qfalse: 03166 break; 03167 case Qnil: 03168 return Qnil; 03169 default: 03170 tmp = rb_str_substr(str, beg, len); 03171 return tmp; 03172 } 03173 } 03174 idx = NUM2LONG(indx); 03175 goto num_index; 03176 } 03177 return Qnil; /* not reached */ 03178 } 03179 03180 03181 /* 03182 * call-seq: 03183 * str[fixnum] -> new_str or nil 03184 * str[fixnum, fixnum] -> new_str or nil 03185 * str[range] -> new_str or nil 03186 * str[regexp] -> new_str or nil 03187 * str[regexp, fixnum] -> new_str or nil 03188 * str[other_str] -> new_str or nil 03189 * str.slice(fixnum) -> new_str or nil 03190 * str.slice(fixnum, fixnum) -> new_str or nil 03191 * str.slice(range) -> new_str or nil 03192 * str.slice(regexp) -> new_str or nil 03193 * str.slice(regexp, fixnum) -> new_str or nil 03194 * str.slice(regexp, capname) -> new_str or nil 03195 * str.slice(other_str) -> new_str or nil 03196 * 03197 * Element Reference---If passed a single <code>Fixnum</code>, returns a 03198 * substring of one character at that position. If passed two <code>Fixnum</code> 03199 * objects, returns a substring starting at the offset given by the first, and 03200 * with a length given by the second. If passed a range, its beginning and end 03201 * are interpreted as offsets delimiting the substring to be returned. In all 03202 * three cases, if an offset is negative, it is counted from the end of <i>str</i>. 03203 * Returns <code>nil</code> if the initial offset falls outside the string or 03204 * the length is negative. 03205 * 03206 * If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is 03207 * returned. If a numeric or name parameter follows the regular expression, that 03208 * component of the <code>MatchData</code> is returned instead. If a 03209 * <code>String</code> is given, that string is returned if it occurs in 03210 * <i>str</i>. In both cases, <code>nil</code> is returned if there is no 03211 * match. 
03212 * 03213 * a = "hello there" 03214 * a[1] #=> "e" 03215 * a[2, 3] #=> "llo" 03216 * a[2..3] #=> "ll" 03217 * a[-3, 2] #=> "er" 03218 * a[7..-2] #=> "her" 03219 * a[-4..-2] #=> "her" 03220 * a[-2..-4] #=> "" 03221 * a[12..-1] #=> nil 03222 * a[/[aeiou](.)\1/] #=> "ell" 03223 * a[/[aeiou](.)\1/, 0] #=> "ell" 03224 * a[/[aeiou](.)\1/, 1] #=> "l" 03225 * a[/[aeiou](.)\1/, 2] #=> nil 03226 * a["lo"] #=> "lo" 03227 * a["bye"] #=> nil 03228 */ 03229 03230 static VALUE 03231 rb_str_aref_m(int argc, VALUE *argv, VALUE str) 03232 { 03233 if (argc == 2) { 03234 if (TYPE(argv[0]) == T_REGEXP) { 03235 return rb_str_subpat(str, argv[0], argv[1]); 03236 } 03237 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 03238 } 03239 if (argc != 1) { 03240 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03241 } 03242 return rb_str_aref(str, argv[0]); 03243 } 03244 03245 VALUE 03246 rb_str_drop_bytes(VALUE str, long len) 03247 { 03248 char *ptr = RSTRING_PTR(str); 03249 long olen = RSTRING_LEN(str), nlen; 03250 03251 str_modifiable(str); 03252 if (len > olen) len = olen; 03253 nlen = olen - len; 03254 if (nlen <= RSTRING_EMBED_LEN_MAX) { 03255 char *oldptr = ptr; 03256 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED)); 03257 STR_SET_EMBED(str); 03258 STR_SET_EMBED_LEN(str, nlen); 03259 ptr = RSTRING(str)->as.ary; 03260 memmove(ptr, oldptr + len, nlen); 03261 if (fl == STR_NOEMBED) xfree(oldptr); 03262 } 03263 else { 03264 if (!STR_SHARED_P(str)) rb_str_new4(str); 03265 ptr = RSTRING(str)->as.heap.ptr += len; 03266 RSTRING(str)->as.heap.len = nlen; 03267 } 03268 ptr[nlen] = 0; 03269 ENC_CODERANGE_CLEAR(str); 03270 return str; 03271 } 03272 03273 static void 03274 rb_str_splice_0(VALUE str, long beg, long len, VALUE val) 03275 { 03276 if (beg == 0 && RSTRING_LEN(val) == 0) { 03277 rb_str_drop_bytes(str, len); 03278 OBJ_INFECT(str, val); 03279 return; 03280 } 03281 03282 rb_str_modify(str); 03283 if (len < RSTRING_LEN(val)) { 03284 /* 
expand string */ 03285 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); 03286 } 03287 03288 if (RSTRING_LEN(val) != len) { 03289 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val), 03290 RSTRING_PTR(str) + beg + len, 03291 RSTRING_LEN(str) - (beg + len)); 03292 } 03293 if (RSTRING_LEN(val) < beg && len < 0) { 03294 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); 03295 } 03296 if (RSTRING_LEN(val) > 0) { 03297 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val)); 03298 } 03299 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len); 03300 if (RSTRING_PTR(str)) { 03301 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 03302 } 03303 OBJ_INFECT(str, val); 03304 } 03305 03306 static void 03307 rb_str_splice(VALUE str, long beg, long len, VALUE val) 03308 { 03309 long slen; 03310 char *p, *e; 03311 rb_encoding *enc; 03312 int singlebyte = single_byte_optimizable(str); 03313 int cr; 03314 03315 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 03316 03317 StringValue(val); 03318 enc = rb_enc_check(str, val); 03319 slen = str_strlen(str, enc); 03320 03321 if (slen < beg) { 03322 out_of_range: 03323 rb_raise(rb_eIndexError, "index %ld out of string", beg); 03324 } 03325 if (beg < 0) { 03326 if (-beg > slen) { 03327 goto out_of_range; 03328 } 03329 beg += slen; 03330 } 03331 if (slen < len || slen < beg + len) { 03332 len = slen - beg; 03333 } 03334 str_modify_keep_cr(str); 03335 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte); 03336 if (!p) p = RSTRING_END(str); 03337 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte); 03338 if (!e) e = RSTRING_END(str); 03339 /* error check */ 03340 beg = p - RSTRING_PTR(str); /* physical position */ 03341 len = e - p; /* physical length */ 03342 rb_str_splice_0(str, beg, len, val); 03343 rb_enc_associate(str, enc); 03344 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val)); 03345 if (cr != ENC_CODERANGE_BROKEN) 03346 ENC_CODERANGE_SET(str, cr); 03347 } 
03348 03349 void 03350 rb_str_update(VALUE str, long beg, long len, VALUE val) 03351 { 03352 rb_str_splice(str, beg, len, val); 03353 } 03354 03355 static void 03356 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val) 03357 { 03358 int nth; 03359 VALUE match; 03360 long start, end, len; 03361 rb_encoding *enc; 03362 struct re_registers *regs; 03363 03364 if (rb_reg_search(re, str, 0, 0) < 0) { 03365 rb_raise(rb_eIndexError, "regexp not matched"); 03366 } 03367 match = rb_backref_get(); 03368 nth = rb_reg_backref_number(match, backref); 03369 regs = RMATCH_REGS(match); 03370 if (nth >= regs->num_regs) { 03371 out_of_range: 03372 rb_raise(rb_eIndexError, "index %d out of regexp", nth); 03373 } 03374 if (nth < 0) { 03375 if (-nth >= regs->num_regs) { 03376 goto out_of_range; 03377 } 03378 nth += regs->num_regs; 03379 } 03380 03381 start = BEG(nth); 03382 if (start == -1) { 03383 rb_raise(rb_eIndexError, "regexp group %d not matched", nth); 03384 } 03385 end = END(nth); 03386 len = end - start; 03387 StringValue(val); 03388 enc = rb_enc_check(str, val); 03389 rb_str_splice_0(str, start, len, val); 03390 rb_enc_associate(str, enc); 03391 } 03392 03393 static VALUE 03394 rb_str_aset(VALUE str, VALUE indx, VALUE val) 03395 { 03396 long idx, beg; 03397 03398 switch (TYPE(indx)) { 03399 case T_FIXNUM: 03400 idx = FIX2LONG(indx); 03401 num_index: 03402 rb_str_splice(str, idx, 1, val); 03403 return val; 03404 03405 case T_REGEXP: 03406 rb_str_subpat_set(str, indx, INT2FIX(0), val); 03407 return val; 03408 03409 case T_STRING: 03410 beg = rb_str_index(str, indx, 0); 03411 if (beg < 0) { 03412 rb_raise(rb_eIndexError, "string not matched"); 03413 } 03414 beg = rb_str_sublen(str, beg); 03415 rb_str_splice(str, beg, str_strlen(indx, 0), val); 03416 return val; 03417 03418 default: 03419 /* check if indx is Range */ 03420 { 03421 long beg, len; 03422 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { 03423 rb_str_splice(str, beg, len, val); 03424 return 
val; 03425 } 03426 } 03427 idx = NUM2LONG(indx); 03428 goto num_index; 03429 } 03430 } 03431 03432 /* 03433 * call-seq: 03434 * str[fixnum] = new_str 03435 * str[fixnum, fixnum] = new_str 03436 * str[range] = aString 03437 * str[regexp] = new_str 03438 * str[regexp, fixnum] = new_str 03439 * str[regexp, name] = new_str 03440 * str[other_str] = new_str 03441 * 03442 * Element Assignment---Replaces some or all of the content of <i>str</i>. The 03443 * portion of the string affected is determined using the same criteria as 03444 * <code>String#[]</code>. If the replacement string is not the same length as 03445 * the text it is replacing, the string will be adjusted accordingly. If the 03446 * regular expression or string is used as the index doesn't match a position 03447 * in the string, <code>IndexError</code> is raised. If the regular expression 03448 * form is used, the optional second <code>Fixnum</code> allows you to specify 03449 * which portion of the match to replace (effectively using the 03450 * <code>MatchData</code> indexing rules. The forms that take a 03451 * <code>Fixnum</code> will raise an <code>IndexError</code> if the value is 03452 * out of range; the <code>Range</code> form will raise a 03453 * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code> 03454 * forms will silently ignore the assignment. 
03455 */ 03456 03457 static VALUE 03458 rb_str_aset_m(int argc, VALUE *argv, VALUE str) 03459 { 03460 if (argc == 3) { 03461 if (TYPE(argv[0]) == T_REGEXP) { 03462 rb_str_subpat_set(str, argv[0], argv[1], argv[2]); 03463 } 03464 else { 03465 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]); 03466 } 03467 return argv[2]; 03468 } 03469 if (argc != 2) { 03470 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc); 03471 } 03472 return rb_str_aset(str, argv[0], argv[1]); 03473 } 03474 03475 /* 03476 * call-seq: 03477 * str.insert(index, other_str) -> str 03478 * 03479 * Inserts <i>other_str</i> before the character at the given 03480 * <i>index</i>, modifying <i>str</i>. Negative indices count from the 03481 * end of the string, and insert <em>after</em> the given character. 03482 * The intent is insert <i>aString</i> so that it starts at the given 03483 * <i>index</i>. 03484 * 03485 * "abcd".insert(0, 'X') #=> "Xabcd" 03486 * "abcd".insert(3, 'X') #=> "abcXd" 03487 * "abcd".insert(4, 'X') #=> "abcdX" 03488 * "abcd".insert(-3, 'X') #=> "abXcd" 03489 * "abcd".insert(-1, 'X') #=> "abcdX" 03490 */ 03491 03492 static VALUE 03493 rb_str_insert(VALUE str, VALUE idx, VALUE str2) 03494 { 03495 long pos = NUM2LONG(idx); 03496 03497 if (pos == -1) { 03498 return rb_str_append(str, str2); 03499 } 03500 else if (pos < 0) { 03501 pos++; 03502 } 03503 rb_str_splice(str, pos, 0, str2); 03504 return str; 03505 } 03506 03507 03508 /* 03509 * call-seq: 03510 * str.slice!(fixnum) -> fixnum or nil 03511 * str.slice!(fixnum, fixnum) -> new_str or nil 03512 * str.slice!(range) -> new_str or nil 03513 * str.slice!(regexp) -> new_str or nil 03514 * str.slice!(other_str) -> new_str or nil 03515 * 03516 * Deletes the specified portion from <i>str</i>, and returns the portion 03517 * deleted. 
03518 * 03519 * string = "this is a string" 03520 * string.slice!(2) #=> "i" 03521 * string.slice!(3..6) #=> " is " 03522 * string.slice!(/s.*t/) #=> "sa st" 03523 * string.slice!("r") #=> "r" 03524 * string #=> "thing" 03525 */ 03526 03527 static VALUE 03528 rb_str_slice_bang(int argc, VALUE *argv, VALUE str) 03529 { 03530 VALUE result; 03531 VALUE buf[3]; 03532 int i; 03533 03534 if (argc < 1 || 2 < argc) { 03535 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03536 } 03537 for (i=0; i<argc; i++) { 03538 buf[i] = argv[i]; 03539 } 03540 str_modify_keep_cr(str); 03541 result = rb_str_aref_m(argc, buf, str); 03542 if (!NIL_P(result)) { 03543 buf[i] = rb_str_new(0,0); 03544 rb_str_aset_m(argc+1, buf, str); 03545 } 03546 return result; 03547 } 03548 03549 static VALUE 03550 get_pat(VALUE pat, int quote) 03551 { 03552 VALUE val; 03553 03554 switch (TYPE(pat)) { 03555 case T_REGEXP: 03556 return pat; 03557 03558 case T_STRING: 03559 break; 03560 03561 default: 03562 val = rb_check_string_type(pat); 03563 if (NIL_P(val)) { 03564 Check_Type(pat, T_REGEXP); 03565 } 03566 pat = val; 03567 } 03568 03569 if (quote) { 03570 pat = rb_reg_quote(pat); 03571 } 03572 03573 return rb_reg_regcomp(pat); 03574 } 03575 03576 03577 /* 03578 * call-seq: 03579 * str.sub!(pattern, replacement) -> str or nil 03580 * str.sub!(pattern) {|match| block } -> str or nil 03581 * 03582 * Performs the substitutions of <code>String#sub</code> in place, 03583 * returning <i>str</i>, or <code>nil</code> if no substitutions were 03584 * performed. 
03585 */ 03586 03587 static VALUE 03588 rb_str_sub_bang(int argc, VALUE *argv, VALUE str) 03589 { 03590 VALUE pat, repl, hash = Qnil; 03591 int iter = 0; 03592 int tainted = 0; 03593 int untrusted = 0; 03594 long plen; 03595 /* accepted shapes: (pattern) with a block, or (pattern, replacement) where replacement converts to a Hash or is string-like */ 03596 if (argc == 1 && rb_block_given_p()) { 03597 iter = 1; 03598 } 03599 else if (argc == 2) { 03600 repl = argv[1]; 03601 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash"); 03602 if (NIL_P(hash)) { 03603 StringValue(repl); 03604 } 03605 if (OBJ_TAINTED(repl)) tainted = 1; 03606 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03607 } 03608 else { 03609 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03610 } 03611 /* quote=1: a String pattern goes through rb_reg_quote before being compiled */ 03612 pat = get_pat(argv[0], 1); 03613 str_modifiable(str); 03614 if (rb_reg_search(pat, str, 0, 0) >= 0) { 03615 rb_encoding *enc; 03616 int cr = ENC_CODERANGE(str); 03617 VALUE match = rb_backref_get(); 03618 struct re_registers *regs = RMATCH_REGS(match); 03619 long beg0 = BEG(0); 03620 long end0 = END(0); 03621 char *p, *rp; 03622 long len, rlen; 03623 03624 if (iter || !NIL_P(hash)) { /* remember ptr/len so str_mod_check can be applied after the block or hash lookup has run */ 03625 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03626 03627 if (iter) { 03628 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03629 } 03630 else { 03631 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); 03632 repl = rb_obj_as_string(repl); 03633 } 03634 str_mod_check(str, p, len); 03635 rb_check_frozen(str); 03636 } 03637 else { 03638 repl = rb_reg_regsub(repl, str, regs, pat); 03639 } 03640 enc = rb_enc_compatible(str, repl); 03641 if (!enc) { /* no common encoding: only acceptable when the text before and after the match scans as 7bit */ 03642 rb_encoding *str_enc = STR_ENC_GET(str); 03643 p = RSTRING_PTR(str); len = RSTRING_LEN(str); 03644 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT || 03645 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) { 03646 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", 03647 rb_enc_name(str_enc), 03648 rb_enc_name(STR_ENC_GET(repl))); 03649 } 03650 enc = STR_ENC_GET(repl); 03651 } 03652 
rb_str_modify(str); 03653 rb_enc_associate(str, enc); 03654 if (OBJ_TAINTED(repl)) tainted = 1; 03655 if (OBJ_UNTRUSTED(repl)) untrusted = 1; 03656 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) { 03657 int cr2 = ENC_CODERANGE(repl); 03658 if (cr2 == ENC_CODERANGE_BROKEN || 03659 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT)) 03660 cr = ENC_CODERANGE_UNKNOWN; 03661 else 03662 cr = cr2; 03663 } 03664 plen = end0 - beg0; /* byte length of the matched text being replaced */ 03665 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl); 03666 len = RSTRING_LEN(str); 03667 if (rlen > plen) { 03668 RESIZE_CAPA(str, len + rlen - plen); 03669 } 03670 p = RSTRING_PTR(str); /* re-read the pointer after the possible resize */ 03671 if (rlen != plen) { /* move the text that follows the match so the replacement fits exactly */ 03672 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen); 03673 } 03674 memcpy(p + beg0, rp, rlen); 03675 len += rlen - plen; 03676 STR_SET_LEN(str, len); 03677 RSTRING_PTR(str)[len] = '\0'; 03678 ENC_CODERANGE_SET(str, cr); 03679 if (tainted) OBJ_TAINT(str); 03680 if (untrusted) OBJ_UNTRUST(str); 03681 03682 return str; 03683 } 03684 return Qnil; /* pattern did not match: nothing substituted */ 03685 } 03686 03687 03688 /* 03689 * call-seq: 03690 * str.sub(pattern, replacement) -> new_str 03691 * str.sub(pattern, hash) -> new_str 03692 * str.sub(pattern) {|match| block } -> new_str 03693 * 03694 * Returns a copy of <i>str</i> with the <em>first</em> occurrence of 03695 * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is 03696 * typically a <code>Regexp</code>; if given as a <code>String</code>, any 03697 * regular expression metacharacters it contains will be interpreted 03698 * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd', 03699 * instead of a digit. 03700 * 03701 * If <i>replacement</i> is a <code>String</code> it will be substituted for 03702 * the matched text. It may contain back-references to the pattern's capture 03703 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or 03704 * <code>\\\k<n></code>, where <i>n</i> is a group name. 
If it is a 03705 * double-quoted string, both back-references must be preceded by an 03706 * additional backslash. However, within <i>replacement</i> the special match 03707 * variables, such as <code>$&</code>, will not refer to the current match. 03708 * 03709 * If the second argument is a <code>Hash</code>, and the matched text is one 03710 * of its keys, the corresponding value is the replacement string. 03711 * 03712 * In the block form, the current match string is passed in as a parameter, 03713 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03714 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03715 * returned by the block will be substituted for the match on each call. 03716 * 03717 * The result inherits any tainting in the original string or any supplied 03718 * replacement string. 03719 * 03720 * "hello".sub(/[aeiou]/, '*') #=> "h*llo" 03721 * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo" 03722 * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello" 03723 * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo" 03724 * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV) 03725 * #=> "Is /bin/bash your preferred shell?" 
03726 */ 03727 03728 static VALUE 03729 rb_str_sub(int argc, VALUE *argv, VALUE str) 03730 { /* substitute on a duplicate; the duplicate is returned whether or not the pattern matched */ 03731 str = rb_str_dup(str); 03732 rb_str_sub_bang(argc, argv, str); 03733 return str; 03734 } 03735 /* Shared body of String#gsub (bang == 0) and String#gsub! (bang != 0). Builds the result in a new buffer; with bang the buffer replaces str's contents, otherwise it is returned as a new string. Returns Qnil for gsub! when nothing matches. */ 03736 static VALUE 03737 str_gsub(int argc, VALUE *argv, VALUE str, int bang) 03738 { 03739 VALUE pat, val, repl, match, dest, hash = Qnil; 03740 struct re_registers *regs; 03741 long beg, n; 03742 long beg0, end0; 03743 long offset, blen, slen, len, last; 03744 int iter = 0; 03745 char *sp, *cp; 03746 int tainted = 0; 03747 rb_encoding *str_enc; 03748 03749 switch (argc) { 03750 case 1: 03751 RETURN_ENUMERATOR(str, argc, argv); 03752 iter = 1; 03753 break; 03754 case 2: 03755 repl = argv[1]; 03756 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash"); 03757 if (NIL_P(hash)) { 03758 StringValue(repl); 03759 } 03760 if (OBJ_TAINTED(repl)) tainted = 1; 03761 break; 03762 default: 03763 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 03764 } 03765 /* quote=1: a String pattern goes through rb_reg_quote before being compiled */ 03766 pat = get_pat(argv[0], 1); 03767 beg = rb_reg_search(pat, str, 0, 0); 03768 if (beg < 0) { 03769 if (bang) return Qnil; /* no match, no substitution */ 03770 return rb_str_dup(str); 03771 } 03772 03773 offset = 0; 03774 n = 0; 03775 blen = RSTRING_LEN(str) + 30; /* len + margin */ 03776 dest = rb_str_buf_new(blen); 03777 sp = RSTRING_PTR(str); 03778 slen = RSTRING_LEN(str); 03779 cp = sp; 03780 str_enc = STR_ENC_GET(str); 03781 rb_enc_associate(dest, str_enc); 03782 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? 
ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID); 03783 /* one iteration per match: append the text between matches, then the replacement */ 03784 do { 03785 n++; 03786 match = rb_backref_get(); 03787 regs = RMATCH_REGS(match); 03788 beg0 = BEG(0); 03789 end0 = END(0); 03790 if (iter || !NIL_P(hash)) { 03791 if (iter) { 03792 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match))); 03793 } 03794 else { 03795 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0))); 03796 val = rb_obj_as_string(val); 03797 } 03798 str_mod_check(str, sp, slen); 03799 if (val == dest) { /* paranoid check [ruby-dev:24827] */ 03800 rb_raise(rb_eRuntimeError, "block should not cheat"); 03801 } 03802 } 03803 else { 03804 val = rb_reg_regsub(repl, str, regs, pat); 03805 } 03806 03807 if (OBJ_TAINTED(val)) tainted = 1; 03808 03809 len = beg - offset; /* copy pre-match substr */ 03810 if (len) { 03811 rb_enc_str_buf_cat(dest, cp, len, str_enc); 03812 } 03813 03814 rb_str_buf_append(dest, val); 03815 03816 last = offset; 03817 offset = end0; 03818 if (beg0 == end0) { 03819 /* 03820 * Always consume at least one character of the input string 03821 * in order to prevent infinite loops. 
03822 */ 03823 if (RSTRING_LEN(str) <= end0) break; 03824 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); 03825 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc); 03826 offset = end0 + len; 03827 } 03828 cp = RSTRING_PTR(str) + offset; 03829 if (offset > RSTRING_LEN(str)) break; 03830 beg = rb_reg_search(pat, str, offset, 0); 03831 } while (beg >= 0); 03832 if (RSTRING_LEN(str) > offset) { /* copy whatever follows the final match */ 03833 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc); 03834 } 03835 rb_reg_search(pat, str, last, 0); /* result discarded; NOTE(review): presumably re-run so the match data reflects the last successful match -- confirm */ 03836 if (bang) { 03837 rb_str_shared_replace(str, dest); 03838 } 03839 else { 03840 RBASIC(dest)->klass = rb_obj_class(str); 03841 OBJ_INFECT(dest, str); 03842 str = dest; 03843 } 03844 03845 if (tainted) OBJ_TAINT(str); 03846 return str; 03847 } 03848 03849 03850 /* 03851 * call-seq: 03852 * str.gsub!(pattern, replacement) -> str or nil 03853 * str.gsub!(pattern) {|match| block } -> str or nil 03854 * str.gsub!(pattern) -> an_enumerator 03855 * 03856 * Performs the substitutions of <code>String#gsub</code> in place, returning 03857 * <i>str</i>, or <code>nil</code> if no substitutions were performed. 03858 * If no block and no <i>replacement</i> is given, an enumerator is returned instead. 03859 */ 03860 03861 static VALUE 03862 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str) 03863 { 03864 str_modify_keep_cr(str); 03865 return str_gsub(argc, argv, str, 1); 03866 } 03867 03868 03869 /* 03870 * call-seq: 03871 * str.gsub(pattern, replacement) -> new_str 03872 * str.gsub(pattern, hash) -> new_str 03873 * str.gsub(pattern) {|match| block } -> new_str 03874 * str.gsub(pattern) -> enumerator 03875 * 03876 * Returns a copy of <i>str</i> with <em>all</em> occurrences of 03877 * <i>pattern</i> substituted for the second argument. 
The <i>pattern</i> is 03878 * typically a <code>Regexp</code>; if given as a <code>String</code>, any 03879 * regular expression metacharacters it contains will be interpreted 03880 * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd', 03881 * instead of a digit. 03882 * 03883 * If <i>replacement</i> is a <code>String</code> it will be substituted for 03884 * the matched text. It may contain back-references to the pattern's capture 03885 * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or 03886 * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a 03887 * double-quoted string, both back-references must be preceded by an 03888 * additional backslash. However, within <i>replacement</i> the special match 03889 * variables, such as <code>$&</code>, will not refer to the current match. 03890 * 03891 * If the second argument is a <code>Hash</code>, and the matched text is one 03892 * of its keys, the corresponding value is the replacement string. 03893 * 03894 * In the block form, the current match string is passed in as a parameter, 03895 * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>, 03896 * <code>$&</code>, and <code>$'</code> will be set appropriately. The value 03897 * returned by the block will be substituted for the match on each call. 03898 * 03899 * The result inherits any tainting in the original string or any supplied 03900 * replacement string. 03901 * 03902 * When neither a block nor a second argument is supplied, an 03903 * <code>Enumerator</code> is returned. 
03904 * 03905 * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*" 03906 * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>" 03907 * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 " 03908 * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}" 03909 * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*" 03910 */ 03911 03912 static VALUE 03913 rb_str_gsub(int argc, VALUE *argv, VALUE str) 03914 { 03915 return str_gsub(argc, argv, str, 0); 03916 } 03917 03918 03919 /* 03920 * call-seq: 03921 * str.replace(other_str) -> str 03922 * 03923 * Replaces the contents and taintedness of <i>str</i> with the corresponding 03924 * values in <i>other_str</i>. 03925 * 03926 * s = "hello" #=> "hello" 03927 * s.replace "world" #=> "world" 03928 */ 03929 03930 VALUE 03931 rb_str_replace(VALUE str, VALUE str2) 03932 { 03933 str_modifiable(str); 03934 if (str == str2) return str; 03935 03936 StringValue(str2); 03937 str_discard(str); 03938 return str_replace(str, str2); 03939 } 03940 03941 /* 03942 * call-seq: 03943 * string.clear -> string 03944 * 03945 * Makes string empty. 03946 * 03947 * a = "abcde" 03948 * a.clear #=> "" 03949 */ 03950 03951 static VALUE 03952 rb_str_clear(VALUE str) 03953 { 03954 str_discard(str); 03955 STR_SET_EMBED(str); 03956 STR_SET_EMBED_LEN(str, 0); 03957 RSTRING_PTR(str)[0] = 0; 03958 if (rb_enc_asciicompat(STR_ENC_GET(str))) 03959 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 03960 else 03961 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 03962 return str; 03963 } 03964 03965 /* 03966 * call-seq: 03967 * string.chr -> string 03968 * 03969 * Returns a one-character string at the beginning of the string. 03970 * 03971 * a = "abcde" 03972 * a.chr #=> "a" 03973 */ 03974 03975 static VALUE 03976 rb_str_chr(VALUE str) 03977 { 03978 return rb_str_substr(str, 0, 1); 03979 } 03980 03981 /* 03982 * call-seq: 03983 * str.getbyte(index) -> 0 .. 255 03984 * 03985 * returns the <i>index</i>th byte as an integer. 
03986 */ 03987 static VALUE 03988 rb_str_getbyte(VALUE str, VALUE index) 03989 { 03990 long pos = NUM2LONG(index); 03991 03992 if (pos < 0) 03993 pos += RSTRING_LEN(str); 03994 if (pos < 0 || RSTRING_LEN(str) <= pos) 03995 return Qnil; 03996 03997 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]); 03998 } 03999 04000 /* 04001 * call-seq: 04002 * str.setbyte(index, int) -> int 04003 * 04004 * modifies the <i>index</i>th byte as <i>int</i>. 04005 */ 04006 static VALUE 04007 rb_str_setbyte(VALUE str, VALUE index, VALUE value) 04008 { 04009 long pos = NUM2LONG(index); 04010 int byte = NUM2INT(value); 04011 04012 rb_str_modify(str); 04013 04014 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos) 04015 rb_raise(rb_eIndexError, "index %ld out of string", pos); 04016 if (pos < 0) 04017 pos += RSTRING_LEN(str); 04018 04019 RSTRING_PTR(str)[pos] = byte; 04020 04021 return value; 04022 } 04023 04024 static VALUE 04025 str_byte_substr(VALUE str, long beg, long len) 04026 { 04027 char *p, *s = RSTRING_PTR(str); 04028 long n = RSTRING_LEN(str); 04029 VALUE str2; 04030 04031 if (beg > n || len < 0) return Qnil; 04032 if (beg < 0) { 04033 beg += n; 04034 if (beg < 0) return Qnil; 04035 } 04036 if (beg + len > n) 04037 len = n - beg; 04038 if (len <= 0) { 04039 len = 0; 04040 p = 0; 04041 } 04042 else 04043 p = s + beg; 04044 04045 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) { 04046 str2 = rb_str_new4(str); 04047 str2 = str_new3(rb_obj_class(str2), str2); 04048 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len; 04049 RSTRING(str2)->as.heap.len = len; 04050 } 04051 else { 04052 str2 = rb_str_new5(str, p, len); 04053 } 04054 04055 str_enc_copy(str2, str); 04056 04057 if (RSTRING_LEN(str2) == 0) { 04058 if (!rb_enc_asciicompat(STR_ENC_GET(str))) 04059 ENC_CODERANGE_SET(str2, ENC_CODERANGE_VALID); 04060 else 04061 ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04062 } 04063 else { 04064 switch (ENC_CODERANGE(str)) { 04065 case ENC_CODERANGE_7BIT: 04066 
ENC_CODERANGE_SET(str2, ENC_CODERANGE_7BIT); 04067 break; 04068 default: 04069 ENC_CODERANGE_SET(str2, ENC_CODERANGE_UNKNOWN); 04070 break; 04071 } 04072 } 04073 04074 OBJ_INFECT(str2, str); 04075 04076 return str2; 04077 } 04078 04079 static VALUE 04080 str_byte_aref(VALUE str, VALUE indx) 04081 { 04082 long idx; 04083 switch (TYPE(indx)) { 04084 case T_FIXNUM: 04085 idx = FIX2LONG(indx); 04086 04087 num_index: 04088 str = str_byte_substr(str, idx, 1); 04089 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil; 04090 return str; 04091 04092 default: 04093 /* check if indx is Range */ 04094 { 04095 long beg, len = RSTRING_LEN(str); 04096 04097 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { 04098 case Qfalse: 04099 break; 04100 case Qnil: 04101 return Qnil; 04102 default: 04103 return str_byte_substr(str, beg, len); 04104 } 04105 } 04106 idx = NUM2LONG(indx); 04107 goto num_index; 04108 } 04109 return Qnil; /* not reached */ 04110 } 04111 04112 /* 04113 * call-seq: 04114 * str.byteslice(fixnum) -> new_str or nil 04115 * str.byteslice(fixnum, fixnum) -> new_str or nil 04116 * str.byteslice(range) -> new_str or nil 04117 * 04118 * Byte Reference---If passed a single <code>Fixnum</code>, returns a 04119 * substring of one byte at that position. If passed two <code>Fixnum</code> 04120 * objects, returns a substring starting at the offset given by the first, and 04121 * a length given by the second. If given a <code>Range</code>, a substring containing 04122 * bytes at offsets given by the range is returned. In all three cases, if 04123 * an offset is negative, it is counted from the end of <i>str</i>. Returns 04124 * <code>nil</code> if the initial offset falls outside the string, the length 04125 * is negative, or the beginning of the range is greater than the end. 04126 * The encoding of the resulted string keeps original encoding. 
04127 * 04128 * "hello".byteslice(1) #=> "e" 04129 * "hello".byteslice(-1) #=> "o" 04130 * "hello".byteslice(1, 2) #=> "el" 04131 * "\x80\u3042".byteslice(1, 3) #=> "\u3042" 04132 * "\x03\u3042\xff".byteslice(1..3) #=> "\u3942" 04133 */ 04134 04135 static VALUE 04136 rb_str_byteslice(int argc, VALUE *argv, VALUE str) 04137 { 04138 if (argc == 2) { 04139 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1])); 04140 } 04141 if (argc != 1) { 04142 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); 04143 } 04144 return str_byte_aref(str, argv[0]); 04145 } 04146 04147 /* 04148 * call-seq: 04149 * str.reverse -> new_str 04150 * 04151 * Returns a new string with the characters from <i>str</i> in reverse order. 04152 * 04153 * "stressed".reverse #=> "desserts" 04154 */ 04155 04156 static VALUE 04157 rb_str_reverse(VALUE str) 04158 { 04159 rb_encoding *enc; 04160 VALUE rev; 04161 char *s, *e, *p; 04162 int single = 1; 04163 04164 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); 04165 enc = STR_ENC_GET(str); 04166 rev = rb_str_new5(str, 0, RSTRING_LEN(str)); 04167 s = RSTRING_PTR(str); e = RSTRING_END(str); 04168 p = RSTRING_END(rev); 04169 04170 if (RSTRING_LEN(str) > 1) { 04171 if (single_byte_optimizable(str)) { 04172 while (s < e) { 04173 *--p = *s++; 04174 } 04175 } 04176 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { 04177 while (s < e) { 04178 int clen = rb_enc_fast_mbclen(s, e, enc); 04179 04180 if (clen > 1 || (*s & 0x80)) single = 0; 04181 p -= clen; 04182 memcpy(p, s, clen); 04183 s += clen; 04184 } 04185 } 04186 else { 04187 while (s < e) { 04188 int clen = rb_enc_mbclen(s, e, enc); 04189 04190 if (clen > 1 || (*s & 0x80)) single = 0; 04191 p -= clen; 04192 memcpy(p, s, clen); 04193 s += clen; 04194 } 04195 } 04196 } 04197 STR_SET_LEN(rev, RSTRING_LEN(str)); 04198 OBJ_INFECT(rev, str); 04199 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) { 04200 if (single) { 04201 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT); 04202 } 
04203 else { 04204 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); 04205 } 04206 } 04207 rb_enc_cr_str_copy_for_substr(rev, str); 04208 04209 return rev; 04210 } 04211 04212 04213 /* 04214 * call-seq: 04215 * str.reverse! -> str 04216 * 04217 * Reverses <i>str</i> in place. 04218 */ 04219 04220 static VALUE 04221 rb_str_reverse_bang(VALUE str) 04222 { 04223 if (RSTRING_LEN(str) > 1) { 04224 if (single_byte_optimizable(str)) { 04225 char *s, *e, c; 04226 04227 str_modify_keep_cr(str); 04228 s = RSTRING_PTR(str); 04229 e = RSTRING_END(str) - 1; 04230 while (s < e) { 04231 c = *s; 04232 *s++ = *e; 04233 *e-- = c; 04234 } 04235 } 04236 else { 04237 rb_str_shared_replace(str, rb_str_reverse(str)); 04238 } 04239 } 04240 else { 04241 str_modify_keep_cr(str); 04242 } 04243 return str; 04244 } 04245 04246 04247 /* 04248 * call-seq: 04249 * str.include? other_str -> true or false 04250 * 04251 * Returns <code>true</code> if <i>str</i> contains the given string or 04252 * character. 04253 * 04254 * "hello".include? "lo" #=> true 04255 * "hello".include? "ol" #=> false 04256 * "hello".include? ?h #=> true 04257 */ 04258 04259 static VALUE 04260 rb_str_include(VALUE str, VALUE arg) 04261 { 04262 long i; 04263 04264 StringValue(arg); 04265 i = rb_str_index(str, arg, 0); 04266 04267 if (i == -1) return Qfalse; 04268 return Qtrue; 04269 } 04270 04271 04272 /* 04273 * call-seq: 04274 * str.to_i(base=10) -> integer 04275 * 04276 * Returns the result of interpreting leading characters in <i>str</i> as an 04277 * integer base <i>base</i> (between 2 and 36). Extraneous characters past the 04278 * end of a valid number are ignored. If there is not a valid number at the 04279 * start of <i>str</i>, <code>0</code> is returned. This method never raises an 04280 * exception when <i>base</i> is valid. 
04281 * 04282 * "12345".to_i #=> 12345 04283 * "99 red balloons".to_i #=> 99 04284 * "0a".to_i #=> 0 04285 * "0a".to_i(16) #=> 10 04286 * "hello".to_i #=> 0 04287 * "1100101".to_i(2) #=> 101 04288 * "1100101".to_i(8) #=> 294977 04289 * "1100101".to_i(10) #=> 1100101 04290 * "1100101".to_i(16) #=> 17826049 04291 */ 04292 04293 static VALUE 04294 rb_str_to_i(int argc, VALUE *argv, VALUE str) 04295 { 04296 int base; 04297 04298 if (argc == 0) base = 10; 04299 else { 04300 VALUE b; 04301 04302 rb_scan_args(argc, argv, "01", &b); 04303 base = NUM2INT(b); 04304 } 04305 if (base < 0) { 04306 rb_raise(rb_eArgError, "invalid radix %d", base); 04307 } 04308 return rb_str_to_inum(str, base, FALSE); 04309 } 04310 04311 04312 /* 04313 * call-seq: 04314 * str.to_f -> float 04315 * 04316 * Returns the result of interpreting leading characters in <i>str</i> as a 04317 * floating point number. Extraneous characters past the end of a valid number 04318 * are ignored. If there is not a valid number at the start of <i>str</i>, 04319 * <code>0.0</code> is returned. This method never raises an exception. 04320 * 04321 * "123.45e1".to_f #=> 1234.5 04322 * "45.67 degrees".to_f #=> 45.67 04323 * "thx1138".to_f #=> 0.0 04324 */ 04325 04326 static VALUE 04327 rb_str_to_f(VALUE str) 04328 { 04329 return DBL2NUM(rb_str_to_dbl(str, FALSE)); 04330 } 04331 04332 04333 /* 04334 * call-seq: 04335 * str.to_s -> str 04336 * str.to_str -> str 04337 * 04338 * Returns the receiver. 
04339 */ 04340 04341 static VALUE 04342 rb_str_to_s(VALUE str) 04343 { 04344 if (rb_obj_class(str) != rb_cString) { 04345 return str_duplicate(rb_cString, str); 04346 } 04347 return str; 04348 } 04349 04350 #if 0 04351 static void 04352 str_cat_char(VALUE str, unsigned int c, rb_encoding *enc) 04353 { 04354 char s[RUBY_MAX_CHAR_LEN]; 04355 int n = rb_enc_codelen(c, enc); 04356 04357 rb_enc_mbcput(c, s, enc); 04358 rb_enc_str_buf_cat(str, s, n, enc); 04359 } 04360 #endif 04361 04362 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */ 04363 04364 int 04365 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) 04366 { 04367 char buf[CHAR_ESC_LEN + 1]; 04368 int l; 04369 04370 #if SIZEOF_INT > 4 04371 c &= 0xffffffff; 04372 #endif 04373 if (unicode_p) { 04374 if (c < 0x7F && ISPRINT(c)) { 04375 snprintf(buf, CHAR_ESC_LEN, "%c", c); 04376 } 04377 else if (c < 0x10000) { 04378 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c); 04379 } 04380 else { 04381 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c); 04382 } 04383 } 04384 else { 04385 if (c < 0x100) { 04386 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c); 04387 } 04388 else { 04389 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c); 04390 } 04391 } 04392 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */ 04393 rb_str_buf_cat(result, buf, l); 04394 return l; 04395 } 04396 04397 /* 04398 * call-seq: 04399 * str.inspect -> string 04400 * 04401 * Returns a printable version of _str_, surrounded by quote marks, 04402 * with special characters escaped. 
04403 * 04404 * str = "hello" 04405 * str[3] = "\b" 04406 * str.inspect #=> "\"hel\\bo\"" 04407 */ 04408 04409 VALUE 04410 rb_str_inspect(VALUE str) 04411 { 04412 rb_encoding *enc = STR_ENC_GET(str); 04413 const char *p, *pend, *prev; 04414 char buf[CHAR_ESC_LEN + 1]; 04415 VALUE result = rb_str_buf_new(0); 04416 rb_encoding *resenc = rb_default_internal_encoding(); 04417 int unicode_p = rb_enc_unicode_p(enc); 04418 int asciicompat = rb_enc_asciicompat(enc); 04419 static rb_encoding *utf16, *utf32; 04420 04421 if (!utf16) utf16 = rb_enc_find("UTF-16"); 04422 if (!utf32) utf32 = rb_enc_find("UTF-32"); 04423 if (resenc == NULL) resenc = rb_default_external_encoding(); 04424 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding(); 04425 rb_enc_associate(result, resenc); 04426 str_buf_cat2(result, "\""); 04427 04428 p = RSTRING_PTR(str); pend = RSTRING_END(str); 04429 prev = p; 04430 if (enc == utf16) { 04431 const unsigned char *q = (const unsigned char *)p; 04432 if (q[0] == 0xFE && q[1] == 0xFF) 04433 enc = rb_enc_find("UTF-16BE"); 04434 else if (q[0] == 0xFF && q[1] == 0xFE) 04435 enc = rb_enc_find("UTF-16LE"); 04436 else 04437 unicode_p = 0; 04438 } 04439 else if (enc == utf32) { 04440 const unsigned char *q = (const unsigned char *)p; 04441 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) 04442 enc = rb_enc_find("UTF-32BE"); 04443 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) 04444 enc = rb_enc_find("UTF-32LE"); 04445 else 04446 unicode_p = 0; 04447 } 04448 while (p < pend) { 04449 unsigned int c, cc; 04450 int n; 04451 04452 n = rb_enc_precise_mbclen(p, pend, enc); 04453 if (!MBCLEN_CHARFOUND_P(n)) { 04454 if (p > prev) str_buf_cat(result, prev, p - prev); 04455 n = rb_enc_mbminlen(enc); 04456 if (pend < p + n) 04457 n = (int)(pend - p); 04458 while (n--) { 04459 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377); 04460 str_buf_cat(result, buf, strlen(buf)); 04461 prev = ++p; 04462 } 04463 continue; 04464 } 04465 n = 
MBCLEN_CHARFOUND_LEN(n); 04466 c = rb_enc_mbc_to_codepoint(p, pend, enc); 04467 p += n; 04468 if ((asciicompat || unicode_p) && 04469 (c == '"'|| c == '\\' || 04470 (c == '#' && 04471 p < pend && 04472 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) && 04473 (cc = rb_enc_codepoint(p,pend,enc), 04474 (cc == '$' || cc == '@' || cc == '{'))))) { 04475 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04476 str_buf_cat2(result, "\\"); 04477 if (asciicompat || enc == resenc) { 04478 prev = p - n; 04479 continue; 04480 } 04481 } 04482 switch (c) { 04483 case '\n': cc = 'n'; break; 04484 case '\r': cc = 'r'; break; 04485 case '\t': cc = 't'; break; 04486 case '\f': cc = 'f'; break; 04487 case '\013': cc = 'v'; break; 04488 case '\010': cc = 'b'; break; 04489 case '\007': cc = 'a'; break; 04490 case 033: cc = 'e'; break; 04491 default: cc = 0; break; 04492 } 04493 if (cc) { 04494 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04495 buf[0] = '\\'; 04496 buf[1] = (char)cc; 04497 str_buf_cat(result, buf, 2); 04498 prev = p; 04499 continue; 04500 } 04501 if ((enc == resenc && rb_enc_isprint(c, enc)) || 04502 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) { 04503 continue; 04504 } 04505 else { 04506 if (p - n > prev) str_buf_cat(result, prev, p - n - prev); 04507 rb_str_buf_cat_escaped_char(result, c, unicode_p); 04508 prev = p; 04509 continue; 04510 } 04511 } 04512 if (p > prev) str_buf_cat(result, prev, p - prev); 04513 str_buf_cat2(result, "\""); 04514 04515 OBJ_INFECT(result, str); 04516 return result; 04517 } 04518 04519 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) 04520 04521 /* 04522 * call-seq: 04523 * str.dump -> new_str 04524 * 04525 * Produces a version of <i>str</i> with all nonprinting characters replaced by 04526 * <code>\nnn</code> notation and all special characters escaped. 
04527 */ 04528 04529 VALUE 04530 rb_str_dump(VALUE str) 04531 { 04532 rb_encoding *enc = rb_enc_get(str); 04533 long len; 04534 const char *p, *pend; 04535 char *q, *qend; 04536 VALUE result; 04537 int u8 = (enc == rb_utf8_encoding()); 04538 /* pass 1: compute the byte length of the escaped output; it must use the same case analysis as the writing pass below */ 04539 len = 2; /* "" */ 04540 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04541 while (p < pend) { 04542 unsigned char c = *p++; 04543 switch (c) { 04544 case '"': case '\\': 04545 case '\n': case '\r': 04546 case '\t': case '\f': 04547 case '\013': case '\010': case '\007': case '\033': 04548 len += 2; 04549 break; 04550 04551 case '#': 04552 len += IS_EVSTR(p, pend) ? 2 : 1; 04553 break; 04554 04555 default: 04556 if (ISPRINT(c)) { 04557 len++; 04558 } 04559 else { 04560 if (u8) { /* \u{NN} */ 04561 int n = rb_enc_precise_mbclen(p-1, pend, enc); 04562 if (MBCLEN_CHARFOUND_P(n-1)) { /* n-1 matches the test made in the writing pass */ 04563 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04564 while (cc >>= 4) len++; 04565 len += 5; 04566 p += MBCLEN_CHARFOUND_LEN(n)-1; 04567 break; 04568 } 04569 } 04570 len += 4; /* \xNN */ 04571 } 04572 break; 04573 } 04574 } 04575 if (!rb_enc_asciicompat(enc)) { 04576 len += 19; /* ".force_encoding('')" */ 04577 len += strlen(enc->name); 04578 } 04579 /* pass 2: allocate the result with that length and write the escaped text */ 04580 result = rb_str_new5(str, 0, len); 04581 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); 04582 q = RSTRING_PTR(result); qend = q + len + 1; 04583 04584 *q++ = '"'; 04585 while (p < pend) { 04586 unsigned char c = *p++; 04587 04588 if (c == '"' || c == '\\') { 04589 *q++ = '\\'; 04590 *q++ = c; 04591 } 04592 else if (c == '#') { 04593 if (IS_EVSTR(p, pend)) *q++ = '\\'; 04594 *q++ = '#'; 04595 } 04596 else if (c == '\n') { 04597 *q++ = '\\'; 04598 *q++ = 'n'; 04599 } 04600 else if (c == '\r') { 04601 *q++ = '\\'; 04602 *q++ = 'r'; 04603 } 04604 else if (c == '\t') { 04605 *q++ = '\\'; 04606 *q++ = 't'; 04607 } 04608 else if (c == '\f') { 04609 *q++ = '\\'; 04610 *q++ = 'f'; 04611 } 04612 else if (c == '\013') { 04613 *q++ = '\\'; 04614 *q++ = 'v'; 04615 } 04616 else if 
(c == '\010') { 04617 *q++ = '\\'; 04618 *q++ = 'b'; 04619 } 04620 else if (c == '\007') { 04621 *q++ = '\\'; 04622 *q++ = 'a'; 04623 } 04624 else if (c == '\033') { 04625 *q++ = '\\'; 04626 *q++ = 'e'; 04627 } 04628 else if (ISPRINT(c)) { 04629 *q++ = c; 04630 } 04631 else { 04632 *q++ = '\\'; 04633 if (u8) { 04634 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1; 04635 if (MBCLEN_CHARFOUND_P(n)) { 04636 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc); 04637 p += n; 04638 snprintf(q, qend-q, "u{%x}", cc); 04639 q += strlen(q); 04640 continue; 04641 } 04642 } 04643 snprintf(q, qend-q, "x%02X", c); 04644 q += 3; /* 'x' plus two hex digits */ 04645 } 04646 } 04647 *q++ = '"'; 04648 *q = '\0'; 04649 if (!rb_enc_asciicompat(enc)) { 04650 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name); 04651 enc = rb_ascii8bit_encoding(); 04652 } 04653 OBJ_INFECT(result, str); 04654 /* result from dump is ASCII */ 04655 rb_enc_associate(result, enc); 04656 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT); 04657 return result; 04658 } 04659 04660 /* Raises EncCompatError when enc is a dummy encoding (rb_enc_dummy_p). */ 04661 static void 04662 rb_str_check_dummy_enc(rb_encoding *enc) 04663 { 04664 if (rb_enc_dummy_p(enc)) { 04665 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s", 04666 rb_enc_name(enc)); 04667 } 04668 } 04669 04670 /* 04671 * call-seq: 04672 * str.upcase! -> str or nil 04673 * 04674 * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes 04675 * were made. 04676 * Note: case replacement is effective only in ASCII region. 
04677 */ 04678 04679 static VALUE 04680 rb_str_upcase_bang(VALUE str) 04681 { 04682 rb_encoding *enc; 04683 char *s, *send; 04684 int modify = 0; 04685 int n; 04686 04687 str_modify_keep_cr(str); 04688 enc = STR_ENC_GET(str); 04689 rb_str_check_dummy_enc(enc); 04690 s = RSTRING_PTR(str); send = RSTRING_END(str); 04691 if (single_byte_optimizable(str)) { 04692 while (s < send) { 04693 unsigned int c = *(unsigned char*)s; 04694 04695 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04696 *s = 'A' + (c - 'a'); 04697 modify = 1; 04698 } 04699 s++; 04700 } 04701 } 04702 else { 04703 int ascompat = rb_enc_asciicompat(enc); 04704 04705 while (s < send) { 04706 unsigned int c; 04707 04708 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04709 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') { 04710 *s = 'A' + (c - 'a'); 04711 modify = 1; 04712 } 04713 s++; 04714 } 04715 else { 04716 c = rb_enc_codepoint_len(s, send, &n, enc); 04717 if (rb_enc_islower(c, enc)) { 04718 /* assuming toupper returns codepoint with same size */ 04719 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04720 modify = 1; 04721 } 04722 s += n; 04723 } 04724 } 04725 } 04726 04727 if (modify) return str; 04728 return Qnil; 04729 } 04730 04731 04732 /* 04733 * call-seq: 04734 * str.upcase -> new_str 04735 * 04736 * Returns a copy of <i>str</i> with all lowercase letters replaced with their 04737 * uppercase counterparts. The operation is locale insensitive---only 04738 * characters ``a'' to ``z'' are affected. 04739 * Note: case replacement is effective only in ASCII region. 04740 * 04741 * "hEllO".upcase #=> "HELLO" 04742 */ 04743 04744 static VALUE 04745 rb_str_upcase(VALUE str) 04746 { 04747 str = rb_str_dup(str); 04748 rb_str_upcase_bang(str); 04749 return str; 04750 } 04751 04752 04753 /* 04754 * call-seq: 04755 * str.downcase! -> str or nil 04756 * 04757 * Downcases the contents of <i>str</i>, returning <code>nil</code> if no 04758 * changes were made. 
04759 * Note: case replacement is effective only in ASCII region. 04760 */ 04761 04762 static VALUE 04763 rb_str_downcase_bang(VALUE str) 04764 { 04765 rb_encoding *enc; 04766 char *s, *send; 04767 int modify = 0; 04768 04769 str_modify_keep_cr(str); 04770 enc = STR_ENC_GET(str); 04771 rb_str_check_dummy_enc(enc); 04772 s = RSTRING_PTR(str); send = RSTRING_END(str); 04773 if (single_byte_optimizable(str)) { 04774 while (s < send) { 04775 unsigned int c = *(unsigned char*)s; 04776 04777 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04778 *s = 'a' + (c - 'A'); 04779 modify = 1; 04780 } 04781 s++; 04782 } 04783 } 04784 else { 04785 int ascompat = rb_enc_asciicompat(enc); 04786 04787 while (s < send) { 04788 unsigned int c; 04789 int n; 04790 04791 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 04792 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') { 04793 *s = 'a' + (c - 'A'); 04794 modify = 1; 04795 } 04796 s++; 04797 } 04798 else { 04799 c = rb_enc_codepoint_len(s, send, &n, enc); 04800 if (rb_enc_isupper(c, enc)) { 04801 /* assuming toupper returns codepoint with same size */ 04802 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04803 modify = 1; 04804 } 04805 s += n; 04806 } 04807 } 04808 } 04809 04810 if (modify) return str; 04811 return Qnil; 04812 } 04813 04814 04815 /* 04816 * call-seq: 04817 * str.downcase -> new_str 04818 * 04819 * Returns a copy of <i>str</i> with all uppercase letters replaced with their 04820 * lowercase counterparts. The operation is locale insensitive---only 04821 * characters ``A'' to ``Z'' are affected. 04822 * Note: case replacement is effective only in ASCII region. 04823 * 04824 * "hEllO".downcase #=> "hello" 04825 */ 04826 04827 static VALUE 04828 rb_str_downcase(VALUE str) 04829 { 04830 str = rb_str_dup(str); 04831 rb_str_downcase_bang(str); 04832 return str; 04833 } 04834 04835 04836 /* 04837 * call-seq: 04838 * str.capitalize! 
-> str or nil 04839 * 04840 * Modifies <i>str</i> by converting the first character to uppercase and the 04841 * remainder to lowercase. Returns <code>nil</code> if no changes are made. 04842 * Note: case conversion is effective only in ASCII region. 04843 * 04844 * a = "hello" 04845 * a.capitalize! #=> "Hello" 04846 * a #=> "Hello" 04847 * a.capitalize! #=> nil 04848 */ 04849 04850 static VALUE 04851 rb_str_capitalize_bang(VALUE str) 04852 { 04853 rb_encoding *enc; 04854 char *s, *send; 04855 int modify = 0; 04856 unsigned int c; 04857 int n; 04858 04859 str_modify_keep_cr(str); 04860 enc = STR_ENC_GET(str); 04861 rb_str_check_dummy_enc(enc); 04862 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 04863 s = RSTRING_PTR(str); send = RSTRING_END(str); 04864 04865 c = rb_enc_codepoint_len(s, send, &n, enc); 04866 if (rb_enc_islower(c, enc)) { 04867 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04868 modify = 1; 04869 } 04870 s += n; 04871 while (s < send) { 04872 c = rb_enc_codepoint_len(s, send, &n, enc); 04873 if (rb_enc_isupper(c, enc)) { 04874 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04875 modify = 1; 04876 } 04877 s += n; 04878 } 04879 04880 if (modify) return str; 04881 return Qnil; 04882 } 04883 04884 04885 /* 04886 * call-seq: 04887 * str.capitalize -> new_str 04888 * 04889 * Returns a copy of <i>str</i> with the first character converted to uppercase 04890 * and the remainder to lowercase. 04891 * Note: case conversion is effective only in ASCII region. 04892 * 04893 * "hello".capitalize #=> "Hello" 04894 * "HELLO".capitalize #=> "Hello" 04895 * "123ABC".capitalize #=> "123abc" 04896 */ 04897 04898 static VALUE 04899 rb_str_capitalize(VALUE str) 04900 { 04901 str = rb_str_dup(str); 04902 rb_str_capitalize_bang(str); 04903 return str; 04904 } 04905 04906 04907 /* 04908 * call-seq: 04909 * str.swapcase! 
-> str or nil 04910 * 04911 * Equivalent to <code>String#swapcase</code>, but modifies the receiver in 04912 * place, returning <i>str</i>, or <code>nil</code> if no changes were made. 04913 * Note: case conversion is effective only in ASCII region. 04914 */ 04915 04916 static VALUE 04917 rb_str_swapcase_bang(VALUE str) 04918 { 04919 rb_encoding *enc; 04920 char *s, *send; 04921 int modify = 0; 04922 int n; 04923 04924 str_modify_keep_cr(str); 04925 enc = STR_ENC_GET(str); 04926 rb_str_check_dummy_enc(enc); 04927 s = RSTRING_PTR(str); send = RSTRING_END(str); 04928 while (s < send) { 04929 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); 04930 04931 if (rb_enc_isupper(c, enc)) { 04932 /* assuming toupper returns codepoint with same size */ 04933 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); 04934 modify = 1; 04935 } 04936 else if (rb_enc_islower(c, enc)) { 04937 /* assuming tolower returns codepoint with same size */ 04938 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); 04939 modify = 1; 04940 } 04941 s += n; 04942 } 04943 04944 if (modify) return str; 04945 return Qnil; 04946 } 04947 04948 04949 /* 04950 * call-seq: 04951 * str.swapcase -> new_str 04952 * 04953 * Returns a copy of <i>str</i> with uppercase alphabetic characters converted 04954 * to lowercase and lowercase characters converted to uppercase. 04955 * Note: case conversion is effective only in ASCII region. 
04956 * 04957 * "Hello".swapcase #=> "hELLO" 04958 * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11" 04959 */ 04960 04961 static VALUE 04962 rb_str_swapcase(VALUE str) 04963 { 04964 str = rb_str_dup(str); 04965 rb_str_swapcase_bang(str); 04966 return str; 04967 } 04968 04969 typedef unsigned char *USTR; 04970 04971 struct tr { 04972 int gen; 04973 unsigned int now, max; 04974 char *p, *pend; 04975 }; 04976 04977 static unsigned int 04978 trnext(struct tr *t, rb_encoding *enc) 04979 { 04980 int n; 04981 04982 for (;;) { 04983 if (!t->gen) { 04984 if (t->p == t->pend) return -1; 04985 if (t->p < t->pend - 1 && *t->p == '\\') { 04986 t->p++; 04987 } 04988 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04989 t->p += n; 04990 if (t->p < t->pend - 1 && *t->p == '-') { 04991 t->p++; 04992 if (t->p < t->pend) { 04993 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc); 04994 t->p += n; 04995 if (t->now > c) { 04996 if (t->now < 0x80 && c < 0x80) { 04997 rb_raise(rb_eArgError, 04998 "invalid range \"%c-%c\" in string transliteration", 04999 t->now, c); 05000 } 05001 else { 05002 rb_raise(rb_eArgError, "invalid range in string transliteration"); 05003 } 05004 continue; /* not reached */ 05005 } 05006 t->gen = 1; 05007 t->max = c; 05008 } 05009 } 05010 return t->now; 05011 } 05012 else if (++t->now < t->max) { 05013 return t->now; 05014 } 05015 else { 05016 t->gen = 0; 05017 return t->max; 05018 } 05019 } 05020 } 05021 05022 static VALUE rb_str_delete_bang(int,VALUE*,VALUE); 05023 05024 static VALUE 05025 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) 05026 { 05027 const unsigned int errc = -1; 05028 unsigned int trans[256]; 05029 rb_encoding *enc, *e1, *e2; 05030 struct tr trsrc, trrepl; 05031 int cflag = 0; 05032 unsigned int c, c0, last = 0; 05033 int modify = 0, i, l; 05034 char *s, *send; 05035 VALUE hash = 0; 05036 int singlebyte = single_byte_optimizable(str); 05037 int cr; 05038 05039 #define CHECK_IF_ASCII(c) \ 05040 (void)((cr == 
ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \ 05041 (cr = ENC_CODERANGE_VALID) : 0) 05042 05043 StringValue(src); 05044 StringValue(repl); 05045 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05046 if (RSTRING_LEN(repl) == 0) { 05047 return rb_str_delete_bang(1, &src, str); 05048 } 05049 05050 cr = ENC_CODERANGE(str); 05051 e1 = rb_enc_check(str, src); 05052 e2 = rb_enc_check(str, repl); 05053 if (e1 == e2) { 05054 enc = e1; 05055 } 05056 else { 05057 enc = rb_enc_check(src, repl); 05058 } 05059 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src); 05060 if (RSTRING_LEN(src) > 1 && 05061 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' && 05062 trsrc.p + l < trsrc.pend) { 05063 cflag = 1; 05064 trsrc.p += l; 05065 } 05066 trrepl.p = RSTRING_PTR(repl); 05067 trrepl.pend = trrepl.p + RSTRING_LEN(repl); 05068 trsrc.gen = trrepl.gen = 0; 05069 trsrc.now = trrepl.now = 0; 05070 trsrc.max = trrepl.max = 0; 05071 05072 if (cflag) { 05073 for (i=0; i<256; i++) { 05074 trans[i] = 1; 05075 } 05076 while ((c = trnext(&trsrc, enc)) != errc) { 05077 if (c < 256) { 05078 trans[c] = errc; 05079 } 05080 else { 05081 if (!hash) hash = rb_hash_new(); 05082 rb_hash_aset(hash, UINT2NUM(c), Qtrue); 05083 } 05084 } 05085 while ((c = trnext(&trrepl, enc)) != errc) 05086 /* retrieve last replacer */; 05087 last = trrepl.now; 05088 for (i=0; i<256; i++) { 05089 if (trans[i] != errc) { 05090 trans[i] = last; 05091 } 05092 } 05093 } 05094 else { 05095 unsigned int r; 05096 05097 for (i=0; i<256; i++) { 05098 trans[i] = errc; 05099 } 05100 while ((c = trnext(&trsrc, enc)) != errc) { 05101 r = trnext(&trrepl, enc); 05102 if (r == errc) r = trrepl.now; 05103 if (c < 256) { 05104 trans[c] = r; 05105 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0; 05106 } 05107 else { 05108 if (!hash) hash = rb_hash_new(); 05109 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r)); 05110 } 05111 } 05112 } 05113 05114 if (cr == ENC_CODERANGE_VALID) 05115 cr = ENC_CODERANGE_7BIT; 05116 
str_modify_keep_cr(str); 05117 s = RSTRING_PTR(str); send = RSTRING_END(str); 05118 if (sflag) { 05119 int clen, tlen; 05120 long offset, max = RSTRING_LEN(str); 05121 unsigned int save = -1; 05122 char *buf = ALLOC_N(char, max), *t = buf; 05123 05124 while (s < send) { 05125 int may_modify = 0; 05126 05127 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05128 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05129 05130 s += clen; 05131 if (c < 256) { 05132 c = trans[c]; 05133 } 05134 else if (hash) { 05135 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05136 if (NIL_P(tmp)) { 05137 if (cflag) c = last; 05138 else c = errc; 05139 } 05140 else if (cflag) c = errc; 05141 else c = NUM2INT(tmp); 05142 } 05143 else { 05144 c = errc; 05145 } 05146 if (c != (unsigned int)-1) { 05147 if (save == c) { 05148 CHECK_IF_ASCII(c); 05149 continue; 05150 } 05151 save = c; 05152 tlen = rb_enc_codelen(c, enc); 05153 modify = 1; 05154 } 05155 else { 05156 save = -1; 05157 c = c0; 05158 if (enc != e1) may_modify = 1; 05159 } 05160 while (t - buf + tlen >= max) { 05161 offset = t - buf; 05162 max *= 2; 05163 REALLOC_N(buf, char, max); 05164 t = buf + offset; 05165 } 05166 rb_enc_mbcput(c, t, enc); 05167 if (may_modify && memcmp(s, t, tlen) != 0) { 05168 modify = 1; 05169 } 05170 CHECK_IF_ASCII(c); 05171 t += tlen; 05172 } 05173 if (!STR_EMBED_P(str)) { 05174 xfree(RSTRING(str)->as.heap.ptr); 05175 } 05176 *t = '\0'; 05177 RSTRING(str)->as.heap.ptr = buf; 05178 RSTRING(str)->as.heap.len = t - buf; 05179 STR_SET_NOEMBED(str); 05180 RSTRING(str)->as.heap.aux.capa = max; 05181 } 05182 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { 05183 while (s < send) { 05184 c = (unsigned char)*s; 05185 if (trans[c] != errc) { 05186 if (!cflag) { 05187 c = trans[c]; 05188 *s = c; 05189 modify = 1; 05190 } 05191 else { 05192 *s = last; 05193 modify = 1; 05194 } 05195 } 05196 CHECK_IF_ASCII(c); 05197 s++; 05198 } 05199 } 05200 else { 05201 int clen, tlen, max = (int)(RSTRING_LEN(str) * 
1.2); 05202 long offset; 05203 char *buf = ALLOC_N(char, max), *t = buf; 05204 05205 while (s < send) { 05206 int may_modify = 0; 05207 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1); 05208 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc); 05209 05210 if (c < 256) { 05211 c = trans[c]; 05212 } 05213 else if (hash) { 05214 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c)); 05215 if (NIL_P(tmp)) { 05216 if (cflag) c = last; 05217 else c = errc; 05218 } 05219 else if (cflag) c = errc; 05220 else c = NUM2INT(tmp); 05221 } 05222 else { 05223 c = cflag ? last : errc; 05224 } 05225 if (c != errc) { 05226 tlen = rb_enc_codelen(c, enc); 05227 modify = 1; 05228 } 05229 else { 05230 c = c0; 05231 if (enc != e1) may_modify = 1; 05232 } 05233 while (t - buf + tlen >= max) { 05234 offset = t - buf; 05235 max *= 2; 05236 REALLOC_N(buf, char, max); 05237 t = buf + offset; 05238 } 05239 if (s != t) { 05240 rb_enc_mbcput(c, t, enc); 05241 if (may_modify && memcmp(s, t, tlen) != 0) { 05242 modify = 1; 05243 } 05244 } 05245 CHECK_IF_ASCII(c); 05246 s += clen; 05247 t += tlen; 05248 } 05249 if (!STR_EMBED_P(str)) { 05250 xfree(RSTRING(str)->as.heap.ptr); 05251 } 05252 *t = '\0'; 05253 RSTRING(str)->as.heap.ptr = buf; 05254 RSTRING(str)->as.heap.len = t - buf; 05255 STR_SET_NOEMBED(str); 05256 RSTRING(str)->as.heap.aux.capa = max; 05257 } 05258 05259 if (modify) { 05260 if (cr != ENC_CODERANGE_BROKEN) 05261 ENC_CODERANGE_SET(str, cr); 05262 rb_enc_associate(str, enc); 05263 return str; 05264 } 05265 return Qnil; 05266 } 05267 05268 05269 /* 05270 * call-seq: 05271 * str.tr!(from_str, to_str) -> str or nil 05272 * 05273 * Translates <i>str</i> in place, using the same rules as 05274 * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no 05275 * changes were made. 
05276 */ 05277 05278 static VALUE 05279 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl) 05280 { 05281 return tr_trans(str, src, repl, 0); 05282 } 05283 05284 05285 /* 05286 * call-seq: 05287 * str.tr(from_str, to_str) => new_str 05288 * 05289 * Returns a copy of <i>str</i> with the characters in <i>from_str</i> 05290 * replaced by the corresponding characters in <i>to_str</i>. If 05291 * <i>to_str</i> is shorter than <i>from_str</i>, it is padded with its last 05292 * character in order to maintain the correspondence. 05293 * 05294 * "hello".tr('el', 'ip') #=> "hippo" 05295 * "hello".tr('aeiou', '*') #=> "h*ll*" 05296 * 05297 * Both strings may use the c1-c2 notation to denote ranges of characters, 05298 * and <i>from_str</i> may start with a <code>^</code>, which denotes all 05299 * characters except those listed. 05300 * 05301 * "hello".tr('a-y', 'b-z') #=> "ifmmp" 05302 * "hello".tr('^aeiou', '*') #=> "*e**o" 05303 */ 05304 05305 static VALUE 05306 rb_str_tr(VALUE str, VALUE src, VALUE repl) 05307 { 05308 str = rb_str_dup(str); 05309 tr_trans(str, src, repl, 0); 05310 return str; 05311 } 05312 05313 #define TR_TABLE_SIZE 257 05314 static void 05315 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, 05316 VALUE *tablep, VALUE *ctablep, rb_encoding *enc) 05317 { 05318 const unsigned int errc = -1; 05319 char buf[256]; 05320 struct tr tr; 05321 unsigned int c; 05322 VALUE table = 0, ptable = 0; 05323 int i, l, cflag = 0; 05324 05325 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); 05326 tr.gen = tr.now = tr.max = 0; 05327 05328 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') { 05329 cflag = 1; 05330 tr.p += l; 05331 } 05332 if (first) { 05333 for (i=0; i<256; i++) { 05334 stable[i] = 1; 05335 } 05336 stable[256] = cflag; 05337 } 05338 else if (stable[256] && !cflag) { 05339 stable[256] = 0; 05340 } 05341 for (i=0; i<256; i++) { 05342 buf[i] = cflag; 05343 } 05344 05345 while ((c = trnext(&tr, enc)) != errc) { 
05346 if (c < 256) { 05347 buf[c & 0xff] = !cflag; 05348 } 05349 else { 05350 VALUE key = UINT2NUM(c); 05351 05352 if (!table) { 05353 table = rb_hash_new(); 05354 if (cflag) { 05355 ptable = *ctablep; 05356 *ctablep = table; 05357 } 05358 else { 05359 ptable = *tablep; 05360 *tablep = table; 05361 } 05362 } 05363 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) { 05364 rb_hash_aset(table, key, Qtrue); 05365 } 05366 } 05367 } 05368 for (i=0; i<256; i++) { 05369 stable[i] = stable[i] && buf[i]; 05370 } 05371 } 05372 05373 05374 static int 05375 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel) 05376 { 05377 if (c < 256) { 05378 return table[c] != 0; 05379 } 05380 else { 05381 VALUE v = UINT2NUM(c); 05382 05383 if (del) { 05384 if (!NIL_P(rb_hash_lookup(del, v)) && 05385 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) { 05386 return TRUE; 05387 } 05388 } 05389 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) { 05390 return FALSE; 05391 } 05392 return table[256] ? TRUE : FALSE; 05393 } 05394 } 05395 05396 /* 05397 * call-seq: 05398 * str.delete!([other_str]+) -> str or nil 05399 * 05400 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or 05401 * <code>nil</code> if <i>str</i> was not modified. 
05402 */ 05403 05404 static VALUE 05405 rb_str_delete_bang(int argc, VALUE *argv, VALUE str) 05406 { 05407 char squeez[TR_TABLE_SIZE]; 05408 rb_encoding *enc = 0; 05409 char *s, *send, *t; 05410 VALUE del = 0, nodel = 0; 05411 int modify = 0; 05412 int i, ascompat, cr; 05413 05414 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; 05415 if (argc < 1) { 05416 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05417 } 05418 for (i=0; i<argc; i++) { 05419 VALUE s = argv[i]; 05420 05421 StringValue(s); 05422 enc = rb_enc_check(str, s); 05423 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05424 } 05425 05426 str_modify_keep_cr(str); 05427 ascompat = rb_enc_asciicompat(enc); 05428 s = t = RSTRING_PTR(str); 05429 send = RSTRING_END(str); 05430 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID; 05431 while (s < send) { 05432 unsigned int c; 05433 int clen; 05434 05435 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05436 if (squeez[c]) { 05437 modify = 1; 05438 } 05439 else { 05440 if (t != s) *t = c; 05441 t++; 05442 } 05443 s++; 05444 } 05445 else { 05446 c = rb_enc_codepoint_len(s, send, &clen, enc); 05447 05448 if (tr_find(c, squeez, del, nodel)) { 05449 modify = 1; 05450 } 05451 else { 05452 if (t != s) rb_enc_mbcput(c, t, enc); 05453 t += clen; 05454 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; 05455 } 05456 s += clen; 05457 } 05458 } 05459 *t = '\0'; 05460 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05461 ENC_CODERANGE_SET(str, cr); 05462 05463 if (modify) return str; 05464 return Qnil; 05465 } 05466 05467 05468 /* 05469 * call-seq: 05470 * str.delete([other_str]+) -> new_str 05471 * 05472 * Returns a copy of <i>str</i> with all characters in the intersection of its 05473 * arguments deleted. Uses the same rules for building the set of characters as 05474 * <code>String#count</code>. 
05475 * 05476 * "hello".delete "l","lo" #=> "heo" 05477 * "hello".delete "lo" #=> "he" 05478 * "hello".delete "aeiou", "^e" #=> "hell" 05479 * "hello".delete "ej-m" #=> "ho" 05480 */ 05481 05482 static VALUE 05483 rb_str_delete(int argc, VALUE *argv, VALUE str) 05484 { 05485 str = rb_str_dup(str); 05486 rb_str_delete_bang(argc, argv, str); 05487 return str; 05488 } 05489 05490 05491 /* 05492 * call-seq: 05493 * str.squeeze!([other_str]*) -> str or nil 05494 * 05495 * Squeezes <i>str</i> in place, returning either <i>str</i>, or 05496 * <code>nil</code> if no changes were made. 05497 */ 05498 05499 static VALUE 05500 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) 05501 { 05502 char squeez[TR_TABLE_SIZE]; 05503 rb_encoding *enc = 0; 05504 VALUE del = 0, nodel = 0; 05505 char *s, *send, *t; 05506 int i, modify = 0; 05507 int ascompat, singlebyte = single_byte_optimizable(str); 05508 unsigned int save; 05509 05510 if (argc == 0) { 05511 enc = STR_ENC_GET(str); 05512 } 05513 else { 05514 for (i=0; i<argc; i++) { 05515 VALUE s = argv[i]; 05516 05517 StringValue(s); 05518 enc = rb_enc_check(str, s); 05519 if (singlebyte && !single_byte_optimizable(s)) 05520 singlebyte = 0; 05521 tr_setup_table(s, squeez, i==0, &del, &nodel, enc); 05522 } 05523 } 05524 05525 str_modify_keep_cr(str); 05526 s = t = RSTRING_PTR(str); 05527 if (!s || RSTRING_LEN(str) == 0) return Qnil; 05528 send = RSTRING_END(str); 05529 save = -1; 05530 ascompat = rb_enc_asciicompat(enc); 05531 05532 if (singlebyte) { 05533 while (s < send) { 05534 unsigned int c = *(unsigned char*)s++; 05535 if (c != save || (argc > 0 && !squeez[c])) { 05536 *t++ = save = c; 05537 } 05538 } 05539 } else { 05540 while (s < send) { 05541 unsigned int c; 05542 int clen; 05543 05544 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05545 if (c != save || (argc > 0 && !squeez[c])) { 05546 *t++ = save = c; 05547 } 05548 s++; 05549 } 05550 else { 05551 c = rb_enc_codepoint_len(s, send, &clen, enc); 05552 05553 if (c != 
save || (argc > 0 && !tr_find(c, squeez, del, nodel))) { 05554 if (t != s) rb_enc_mbcput(c, t, enc); 05555 save = c; 05556 t += clen; 05557 } 05558 s += clen; 05559 } 05560 } 05561 } 05562 05563 *t = '\0'; 05564 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) { 05565 STR_SET_LEN(str, t - RSTRING_PTR(str)); 05566 modify = 1; 05567 } 05568 05569 if (modify) return str; 05570 return Qnil; 05571 } 05572 05573 05574 /* 05575 * call-seq: 05576 * str.squeeze([other_str]*) -> new_str 05577 * 05578 * Builds a set of characters from the <i>other_str</i> parameter(s) using the 05579 * procedure described for <code>String#count</code>. Returns a new string 05580 * where runs of the same character that occur in this set are replaced by a 05581 * single character. If no arguments are given, all runs of identical 05582 * characters are replaced by a single character. 05583 * 05584 * "yellow moon".squeeze #=> "yelow mon" 05585 * " now is the".squeeze(" ") #=> " now is the" 05586 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls" 05587 */ 05588 05589 static VALUE 05590 rb_str_squeeze(int argc, VALUE *argv, VALUE str) 05591 { 05592 str = rb_str_dup(str); 05593 rb_str_squeeze_bang(argc, argv, str); 05594 return str; 05595 } 05596 05597 05598 /* 05599 * call-seq: 05600 * str.tr_s!(from_str, to_str) -> str or nil 05601 * 05602 * Performs <code>String#tr_s</code> processing on <i>str</i> in place, 05603 * returning <i>str</i>, or <code>nil</code> if no changes were made. 05604 */ 05605 05606 static VALUE 05607 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl) 05608 { 05609 return tr_trans(str, src, repl, 1); 05610 } 05611 05612 05613 /* 05614 * call-seq: 05615 * str.tr_s(from_str, to_str) -> new_str 05616 * 05617 * Processes a copy of <i>str</i> as described under <code>String#tr</code>, 05618 * then removes duplicate characters in regions that were affected by the 05619 * translation. 
05620 * 05621 * "hello".tr_s('l', 'r') #=> "hero" 05622 * "hello".tr_s('el', '*') #=> "h*o" 05623 * "hello".tr_s('el', 'hx') #=> "hhxo" 05624 */ 05625 05626 static VALUE 05627 rb_str_tr_s(VALUE str, VALUE src, VALUE repl) 05628 { 05629 str = rb_str_dup(str); 05630 tr_trans(str, src, repl, 1); 05631 return str; 05632 } 05633 05634 05635 /* 05636 * call-seq: 05637 * str.count([other_str]+) -> fixnum 05638 * 05639 * Each <i>other_str</i> parameter defines a set of characters to count. The 05640 * intersection of these sets defines the characters to count in 05641 * <i>str</i>. Any <i>other_str</i> that starts with a caret (^) is 05642 * negated. The sequence c1--c2 means all characters between c1 and c2. 05643 * 05644 * a = "hello world" 05645 * a.count "lo" #=> 5 05646 * a.count "lo", "o" #=> 2 05647 * a.count "hello", "^l" #=> 4 05648 * a.count "ej-m" #=> 4 05649 */ 05650 05651 static VALUE 05652 rb_str_count(int argc, VALUE *argv, VALUE str) 05653 { 05654 char table[TR_TABLE_SIZE]; 05655 rb_encoding *enc = 0; 05656 VALUE del = 0, nodel = 0; 05657 char *s, *send; 05658 int i; 05659 int ascompat; 05660 05661 if (argc < 1) { 05662 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)"); 05663 } 05664 for (i=0; i<argc; i++) { 05665 VALUE tstr = argv[i]; 05666 unsigned char c; 05667 05668 StringValue(tstr); 05669 enc = rb_enc_check(str, tstr); 05670 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) && 05671 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) { 05672 int n = 0; 05673 05674 s = RSTRING_PTR(str); 05675 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05676 send = RSTRING_END(str); 05677 while (s < send) { 05678 if (*(unsigned char*)s++ == c) n++; 05679 } 05680 return INT2NUM(n); 05681 } 05682 tr_setup_table(tstr, table, i==0, &del, &nodel, enc); 05683 } 05684 05685 s = RSTRING_PTR(str); 05686 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0); 05687 send = RSTRING_END(str); 05688 ascompat = rb_enc_asciicompat(enc); 
05689 i = 0; 05690 while (s < send) { 05691 unsigned int c; 05692 05693 if (ascompat && (c = *(unsigned char*)s) < 0x80) { 05694 if (table[c]) { 05695 i++; 05696 } 05697 s++; 05698 } 05699 else { 05700 int clen; 05701 c = rb_enc_codepoint_len(s, send, &clen, enc); 05702 if (tr_find(c, table, del, nodel)) { 05703 i++; 05704 } 05705 s += clen; 05706 } 05707 } 05708 05709 return INT2NUM(i); 05710 } 05711 05712 static const char isspacetable[256] = { 05713 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 05714 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05715 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05716 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05717 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05718 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05719 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05720 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05721 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05722 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05723 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05724 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05725 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05726 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 05728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 05729 }; 05730 05731 #define ascii_isspace(c) isspacetable[(unsigned char)(c)] 05732 05733 /* 05734 * call-seq: 05735 * str.split(pattern=$;, [limit]) -> anArray 05736 * 05737 * Divides <i>str</i> into substrings based on a delimiter, returning an array 05738 * of these substrings. 05739 * 05740 * If <i>pattern</i> is a <code>String</code>, then its contents are used as 05741 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single 05742 * space, <i>str</i> is split on whitespace, with leading whitespace and runs 05743 * of contiguous whitespace characters ignored. 
 *
 *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
 *  pattern matches. Whenever the pattern matches a zero-length string,
 *  <i>str</i> is split into individual characters. If <i>pattern</i> contains
 *  groups, the respective matches will be returned in the array as well.
 *
 *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
 *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
 *  split on whitespace as if ` ' were specified.
 *
 *  If the <i>limit</i> parameter is omitted, trailing null fields are
 *  suppressed. If <i>limit</i> is a positive number, at most that number of
 *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
 *  string is returned as the only entry in an array). If negative, there is no
 *  limit to the number of fields returned, and trailing null fields are not
 *  suppressed.
 *
 *     " now's  the time".split        #=> ["now's", "the", "time"]
 *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
 *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
 *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
 *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
 *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
 *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
 *
 *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
 *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
 *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
 *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
 */

static VALUE
rb_str_split_m(int argc, VALUE *argv, VALUE str)
{
    rb_encoding *enc;
    VALUE spat;
    VALUE limit;
    /* awk: whitespace splitting; string: literal delimiter; regexp: pattern */
    enum {awk, string, regexp} split_type;
    /* i counts the fields produced so far once a positive limit is active */
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        /* zero/negative limit: treated as "no limit" from here on; the sign
         * of lim is consulted again when handling trailing fields */
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            if (RSTRING_LEN(str) == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        i = 1;
    }

    enc = STR_ENC_GET(str);
    if (NIL_P(spat)) {
        if (!NIL_P(rb_fs)) {
            /* no pattern given but rb_fs is set: classify rb_fs below */
            spat = rb_fs;
            goto fs_set;
        }
        split_type = awk;
    }
    else {
      fs_set:
        if (TYPE(spat) == T_STRING) {
            rb_encoding *enc2 = STR_ENC_GET(spat);

            split_type = string;
            if (RSTRING_LEN(spat) == 0) {
                /* Special case - split into chars */
                spat = rb_reg_regcomp(spat);
                split_type = regexp;
            }
            else if (rb_enc_asciicompat(enc2) == 1) {
                /* a single " " selects awk-style splitting */
                if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
                    split_type = awk;
                }
            }
            else {
                /* same single-space test for non-ASCII-compatible encodings */
                int l;
                if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
                    RSTRING_LEN(spat) == l) {
                    split_type = awk;
                }
            }
        }
        else {
            spat = get_pat(spat, 1);
            split_type = regexp;
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (split_type == awk) {
        char *ptr = RSTRING_PTR(str);
        char *eptr = RSTRING_END(str);
        char *bptr = ptr;
        /* skip != 0 while scanning whitespace between fields */
        int skip = 1;
        unsigned int c;

        end = beg;
        if (is_ascii_string(str)) {
            /* byte-wise scan using the ASCII whitespace table */
            while (ptr < eptr) {
                c = (unsigned char)*ptr++;
                if (skip) {
                    if (ascii_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        /* limit reached: the rest is pushed after the scan */
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (ascii_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
        else {
            /* same state machine, decoding one codepoint per step */
            while (ptr < eptr) {
                int n;

                c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
                ptr += n;
                if (skip) {
                    if (rb_isspace(c)) {
                        beg = ptr - bptr;
                    }
                    else {
                        end = ptr - bptr;
                        skip = 0;
                        if (!NIL_P(limit) && lim <= i) break;
                    }
                }
                else if (rb_isspace(c)) {
                    rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                    skip = 1;
                    beg = ptr - bptr;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end = ptr - bptr;
                }
            }
        }
    }
    else if (split_type == string) {
        char *ptr = RSTRING_PTR(str);
        char *temp = ptr;       /* start of str, used to turn ptr into an offset */
        char *eptr = RSTRING_END(str);
        char *sptr = RSTRING_PTR(spat);
        long slen = RSTRING_LEN(spat);

        if (is_broken_string(str)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
        }
        if (is_broken_string(spat)) {
            rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
        }
        enc = rb_enc_check(str, spat);
        /* here `end` is the match offset relative to ptr, not to str */
        while (ptr < eptr &&
               (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
            /* Check we are at the start of a char */
            char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
            if (t != ptr + end) {
                ptr = t;
                continue;
            }
            rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
            ptr += end + slen;
            if (!NIL_P(limit) && lim <= ++i) break;
        }
        beg = ptr - temp;
    }
    else {
        char *ptr = RSTRING_PTR(str);
        long len = RSTRING_LEN(str);
        long start = beg;
        long idx;
        /* set after an empty match was skipped at the current position */
        int last_null = 0;
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH_REGS(rb_backref_get());
            if (start == end && BEG(0) == END(0)) {
                /* zero-length match at the search position */
                if (!ptr) {
                    rb_ary_push(result, str_new_empty(str));
                    break;
                }
                else if (last_null == 1) {
                    /* second empty match in a row: emit one character */
                    rb_ary_push(result, rb_str_subseq(str, beg,
                                                      rb_enc_fast_mbclen(ptr+beg,
                                                                         ptr+len,
                                                                         enc)));
                    beg = start;
                }
                else {
                    /* first empty match: step over one character and retry */
                    if (ptr+start == ptr+len)
                        start++;
                    else
                        start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            /* append the text of every capture group that participated */
            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = str_new_empty(str);
                else
                    tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    /* push whatever remains after the last delimiter */
    if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
        if (RSTRING_LEN(str) == beg)
            tmp = str_new_empty(str);
        else
            tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
        rb_ary_push(result, tmp);
    }
    /* limit omitted or zero: drop trailing empty fields */
    if (NIL_P(limit) && lim == 0) {
        long len;
        while ((len = RARRAY_LEN(result)) > 0 &&
               (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
            rb_ary_pop(result);
    }

    return result;
}

/*
 * C-level convenience wrapper: splits +str+ on the C string +sep0+ by
 * calling rb_str_split_m with that separator and no limit.
 */
VALUE
rb_str_split(VALUE str, const char *sep0)
{
    VALUE sep;

    StringValue(str);
    sep = rb_str_new2(sep0);
    return rb_str_split_m(1, &sep, str);
}


/*
 *  call-seq:
 *     str.each_line(separator=$/) {|substr| block }   -> str
 *     str.each_line(separator=$/)                     -> an_enumerator
 *
 *     str.lines(separator=$/) {|substr| block }       -> str
 *     str.lines(separator=$/)                         -> an_enumerator
 *
 *  Splits <i>str</i> using the supplied parameter as the record separator
 *  (<code>$/</code> by default), passing each substring in turn to the supplied
 *  block.
If a zero-length record separator is supplied, the string is split 06012 * into paragraphs delimited by multiple successive newlines. 06013 * 06014 * If no block is given, an enumerator is returned instead. 06015 * 06016 * print "Example one\n" 06017 * "hello\nworld".each_line {|s| p s} 06018 * print "Example two\n" 06019 * "hello\nworld".each_line('l') {|s| p s} 06020 * print "Example three\n" 06021 * "hello\n\n\nworld".each_line('') {|s| p s} 06022 * 06023 * <em>produces:</em> 06024 * 06025 * Example one 06026 * "hello\n" 06027 * "world" 06028 * Example two 06029 * "hel" 06030 * "l" 06031 * "o\nworl" 06032 * "d" 06033 * Example three 06034 * "hello\n\n\n" 06035 * "world" 06036 */ 06037 06038 static VALUE 06039 rb_str_each_line(int argc, VALUE *argv, VALUE str) 06040 { 06041 rb_encoding *enc; 06042 VALUE rs; 06043 unsigned int newline; 06044 const char *p, *pend, *s, *ptr; 06045 long len, rslen; 06046 VALUE line; 06047 int n; 06048 VALUE orig = str; 06049 06050 if (argc == 0) { 06051 rs = rb_rs; 06052 } 06053 else { 06054 rb_scan_args(argc, argv, "01", &rs); 06055 } 06056 RETURN_ENUMERATOR(str, argc, argv); 06057 if (NIL_P(rs)) { 06058 rb_yield(str); 06059 return orig; 06060 } 06061 str = rb_str_new4(str); 06062 ptr = p = s = RSTRING_PTR(str); 06063 pend = p + RSTRING_LEN(str); 06064 len = RSTRING_LEN(str); 06065 StringValue(rs); 06066 if (rs == rb_default_rs) { 06067 enc = rb_enc_get(str); 06068 while (p < pend) { 06069 char *p0; 06070 06071 p = memchr(p, '\n', pend - p); 06072 if (!p) break; 06073 p0 = rb_enc_left_char_head(s, p, pend, enc); 06074 if (!rb_enc_is_newline(p0, pend, enc)) { 06075 p++; 06076 continue; 06077 } 06078 p = p0 + rb_enc_mbclen(p0, pend, enc); 06079 line = rb_str_new5(str, s, p - s); 06080 OBJ_INFECT(line, str); 06081 rb_enc_cr_str_copy_for_substr(line, str); 06082 rb_yield(line); 06083 str_mod_check(str, ptr, len); 06084 s = p; 06085 } 06086 goto finish; 06087 } 06088 06089 enc = rb_enc_check(str, rs); 06090 rslen = RSTRING_LEN(rs); 06091 
if (rslen == 0) { 06092 newline = '\n'; 06093 } 06094 else { 06095 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); 06096 } 06097 06098 while (p < pend) { 06099 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc); 06100 06101 again: 06102 if (rslen == 0 && c == newline) { 06103 p += n; 06104 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) { 06105 goto again; 06106 } 06107 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) { 06108 p += n; 06109 } 06110 p -= n; 06111 } 06112 if (c == newline && 06113 (rslen <= 1 || 06114 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) { 06115 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); 06116 OBJ_INFECT(line, str); 06117 rb_enc_cr_str_copy_for_substr(line, str); 06118 rb_yield(line); 06119 str_mod_check(str, ptr, len); 06120 s = p + (rslen ? rslen : n); 06121 } 06122 p += n; 06123 } 06124 06125 finish: 06126 if (s != pend) { 06127 line = rb_str_new5(str, s, pend - s); 06128 OBJ_INFECT(line, str); 06129 rb_enc_cr_str_copy_for_substr(line, str); 06130 rb_yield(line); 06131 } 06132 06133 return orig; 06134 } 06135 06136 06137 /* 06138 * call-seq: 06139 * str.bytes {|fixnum| block } -> str 06140 * str.bytes -> an_enumerator 06141 * 06142 * str.each_byte {|fixnum| block } -> str 06143 * str.each_byte -> an_enumerator 06144 * 06145 * Passes each byte in <i>str</i> to the given block, or returns 06146 * an enumerator if no block is given. 
06147 * 06148 * "hello".each_byte {|c| print c, ' ' } 06149 * 06150 * <em>produces:</em> 06151 * 06152 * 104 101 108 108 111 06153 */ 06154 06155 static VALUE 06156 rb_str_each_byte(VALUE str) 06157 { 06158 long i; 06159 06160 RETURN_ENUMERATOR(str, 0, 0); 06161 for (i=0; i<RSTRING_LEN(str); i++) { 06162 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff)); 06163 } 06164 return str; 06165 } 06166 06167 06168 /* 06169 * call-seq: 06170 * str.chars {|cstr| block } -> str 06171 * str.chars -> an_enumerator 06172 * 06173 * str.each_char {|cstr| block } -> str 06174 * str.each_char -> an_enumerator 06175 * 06176 * Passes each character in <i>str</i> to the given block, or returns 06177 * an enumerator if no block is given. 06178 * 06179 * "hello".each_char {|c| print c, ' ' } 06180 * 06181 * <em>produces:</em> 06182 * 06183 * h e l l o 06184 */ 06185 06186 static VALUE 06187 rb_str_each_char(VALUE str) 06188 { 06189 VALUE orig = str; 06190 long i, len, n; 06191 const char *ptr; 06192 rb_encoding *enc; 06193 06194 RETURN_ENUMERATOR(str, 0, 0); 06195 str = rb_str_new4(str); 06196 ptr = RSTRING_PTR(str); 06197 len = RSTRING_LEN(str); 06198 enc = rb_enc_get(str); 06199 switch (ENC_CODERANGE(str)) { 06200 case ENC_CODERANGE_VALID: 06201 case ENC_CODERANGE_7BIT: 06202 for (i = 0; i < len; i += n) { 06203 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); 06204 rb_yield(rb_str_subseq(str, i, n)); 06205 } 06206 break; 06207 default: 06208 for (i = 0; i < len; i += n) { 06209 n = rb_enc_mbclen(ptr + i, ptr + len, enc); 06210 rb_yield(rb_str_subseq(str, i, n)); 06211 } 06212 } 06213 return orig; 06214 } 06215 06216 /* 06217 * call-seq: 06218 * str.codepoints {|integer| block } -> str 06219 * str.codepoints -> an_enumerator 06220 * 06221 * str.each_codepoint {|integer| block } -> str 06222 * str.each_codepoint -> an_enumerator 06223 * 06224 * Passes the <code>Integer</code> ordinal of each character in <i>str</i>, 06225 * also known as a <i>codepoint</i> when applied to Unicode strings to 
the 06226 * given block. 06227 * 06228 * If no block is given, an enumerator is returned instead. 06229 * 06230 * "hello\u0639".each_codepoint {|c| print c, ' ' } 06231 * 06232 * <em>produces:</em> 06233 * 06234 * 104 101 108 108 111 1593 06235 */ 06236 06237 static VALUE 06238 rb_str_each_codepoint(VALUE str) 06239 { 06240 VALUE orig = str; 06241 int n; 06242 unsigned int c; 06243 const char *ptr, *end; 06244 rb_encoding *enc; 06245 06246 if (single_byte_optimizable(str)) return rb_str_each_byte(str); 06247 RETURN_ENUMERATOR(str, 0, 0); 06248 str = rb_str_new4(str); 06249 ptr = RSTRING_PTR(str); 06250 end = RSTRING_END(str); 06251 enc = STR_ENC_GET(str); 06252 while (ptr < end) { 06253 c = rb_enc_codepoint_len(ptr, end, &n, enc); 06254 rb_yield(UINT2NUM(c)); 06255 ptr += n; 06256 } 06257 return orig; 06258 } 06259 06260 static long 06261 chopped_length(VALUE str) 06262 { 06263 rb_encoding *enc = STR_ENC_GET(str); 06264 const char *p, *p2, *beg, *end; 06265 06266 beg = RSTRING_PTR(str); 06267 end = beg + RSTRING_LEN(str); 06268 if (beg > end) return 0; 06269 p = rb_enc_prev_char(beg, end, end, enc); 06270 if (!p) return 0; 06271 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') { 06272 p2 = rb_enc_prev_char(beg, p, end, enc); 06273 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2; 06274 } 06275 return p - beg; 06276 } 06277 06278 /* 06279 * call-seq: 06280 * str.chop! -> str or nil 06281 * 06282 * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>, 06283 * or <code>nil</code> if <i>str</i> is the empty string. See also 06284 * <code>String#chomp!</code>. 
06285 */ 06286 06287 static VALUE 06288 rb_str_chop_bang(VALUE str) 06289 { 06290 str_modify_keep_cr(str); 06291 if (RSTRING_LEN(str) > 0) { 06292 long len; 06293 len = chopped_length(str); 06294 STR_SET_LEN(str, len); 06295 RSTRING_PTR(str)[len] = '\0'; 06296 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06297 ENC_CODERANGE_CLEAR(str); 06298 } 06299 return str; 06300 } 06301 return Qnil; 06302 } 06303 06304 06305 /* 06306 * call-seq: 06307 * str.chop -> new_str 06308 * 06309 * Returns a new <code>String</code> with the last character removed. If the 06310 * string ends with <code>\r\n</code>, both characters are removed. Applying 06311 * <code>chop</code> to an empty string returns an empty 06312 * string. <code>String#chomp</code> is often a safer alternative, as it leaves 06313 * the string unchanged if it doesn't end in a record separator. 06314 * 06315 * "string\r\n".chop #=> "string" 06316 * "string\n\r".chop #=> "string\n" 06317 * "string\n".chop #=> "string" 06318 * "string".chop #=> "strin" 06319 * "x".chop.chop #=> "" 06320 */ 06321 06322 static VALUE 06323 rb_str_chop(VALUE str) 06324 { 06325 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str)); 06326 rb_enc_cr_str_copy_for_substr(str2, str); 06327 OBJ_INFECT(str2, str); 06328 return str2; 06329 } 06330 06331 06332 /* 06333 * call-seq: 06334 * str.chomp!(separator=$/) -> str or nil 06335 * 06336 * Modifies <i>str</i> in place as described for <code>String#chomp</code>, 06337 * returning <i>str</i>, or <code>nil</code> if no modifications were made. 
06338 */ 06339 06340 static VALUE 06341 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str) 06342 { 06343 rb_encoding *enc; 06344 VALUE rs; 06345 int newline; 06346 char *p, *pp, *e; 06347 long len, rslen; 06348 06349 str_modify_keep_cr(str); 06350 len = RSTRING_LEN(str); 06351 if (len == 0) return Qnil; 06352 p = RSTRING_PTR(str); 06353 e = p + len; 06354 if (argc == 0) { 06355 rs = rb_rs; 06356 if (rs == rb_default_rs) { 06357 smart_chomp: 06358 enc = rb_enc_get(str); 06359 if (rb_enc_mbminlen(enc) > 1) { 06360 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc); 06361 if (rb_enc_is_newline(pp, e, enc)) { 06362 e = pp; 06363 } 06364 pp = e - rb_enc_mbminlen(enc); 06365 if (pp >= p) { 06366 pp = rb_enc_left_char_head(p, pp, e, enc); 06367 if (rb_enc_ascget(pp, e, 0, enc) == '\r') { 06368 e = pp; 06369 } 06370 } 06371 if (e == RSTRING_END(str)) { 06372 return Qnil; 06373 } 06374 len = e - RSTRING_PTR(str); 06375 STR_SET_LEN(str, len); 06376 } 06377 else { 06378 if (RSTRING_PTR(str)[len-1] == '\n') { 06379 STR_DEC_LEN(str); 06380 if (RSTRING_LEN(str) > 0 && 06381 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') { 06382 STR_DEC_LEN(str); 06383 } 06384 } 06385 else if (RSTRING_PTR(str)[len-1] == '\r') { 06386 STR_DEC_LEN(str); 06387 } 06388 else { 06389 return Qnil; 06390 } 06391 } 06392 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06393 return str; 06394 } 06395 } 06396 else { 06397 rb_scan_args(argc, argv, "01", &rs); 06398 } 06399 if (NIL_P(rs)) return Qnil; 06400 StringValue(rs); 06401 rslen = RSTRING_LEN(rs); 06402 if (rslen == 0) { 06403 while (len>0 && p[len-1] == '\n') { 06404 len--; 06405 if (len>0 && p[len-1] == '\r') 06406 len--; 06407 } 06408 if (len < RSTRING_LEN(str)) { 06409 STR_SET_LEN(str, len); 06410 RSTRING_PTR(str)[len] = '\0'; 06411 return str; 06412 } 06413 return Qnil; 06414 } 06415 if (rslen > len) return Qnil; 06416 newline = RSTRING_PTR(rs)[rslen-1]; 06417 if (rslen == 1 && newline == '\n') 06418 goto smart_chomp; 06419 06420 enc = 
rb_enc_check(str, rs); 06421 if (is_broken_string(rs)) { 06422 return Qnil; 06423 } 06424 pp = e - rslen; 06425 if (p[len-1] == newline && 06426 (rslen <= 1 || 06427 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) { 06428 if (rb_enc_left_char_head(p, pp, e, enc) != pp) 06429 return Qnil; 06430 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) { 06431 ENC_CODERANGE_CLEAR(str); 06432 } 06433 STR_SET_LEN(str, RSTRING_LEN(str) - rslen); 06434 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06435 return str; 06436 } 06437 return Qnil; 06438 } 06439 06440 06441 /* 06442 * call-seq: 06443 * str.chomp(separator=$/) -> new_str 06444 * 06445 * Returns a new <code>String</code> with the given record separator removed 06446 * from the end of <i>str</i> (if present). If <code>$/</code> has not been 06447 * changed from the default Ruby record separator, then <code>chomp</code> also 06448 * removes carriage return characters (that is it will remove <code>\n</code>, 06449 * <code>\r</code>, and <code>\r\n</code>). 06450 * 06451 * "hello".chomp #=> "hello" 06452 * "hello\n".chomp #=> "hello" 06453 * "hello\r\n".chomp #=> "hello" 06454 * "hello\n\r".chomp #=> "hello\n" 06455 * "hello\r".chomp #=> "hello" 06456 * "hello \n there".chomp #=> "hello \n there" 06457 * "hello".chomp("llo") #=> "he" 06458 */ 06459 06460 static VALUE 06461 rb_str_chomp(int argc, VALUE *argv, VALUE str) 06462 { 06463 str = rb_str_dup(str); 06464 rb_str_chomp_bang(argc, argv, str); 06465 return str; 06466 } 06467 06468 /* 06469 * call-seq: 06470 * str.lstrip! -> self or nil 06471 * 06472 * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no 06473 * change was made. See also <code>String#rstrip!</code> and 06474 * <code>String#strip!</code>. 06475 * 06476 * " hello ".lstrip #=> "hello " 06477 * "hello".lstrip! 
#=> nil 06478 */ 06479 06480 static VALUE 06481 rb_str_lstrip_bang(VALUE str) 06482 { 06483 rb_encoding *enc; 06484 char *s, *t, *e; 06485 06486 str_modify_keep_cr(str); 06487 enc = STR_ENC_GET(str); 06488 s = RSTRING_PTR(str); 06489 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06490 e = t = RSTRING_END(str); 06491 /* remove spaces at head */ 06492 while (s < e) { 06493 int n; 06494 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc); 06495 06496 if (!rb_isspace(cc)) break; 06497 s += n; 06498 } 06499 06500 if (s > RSTRING_PTR(str)) { 06501 STR_SET_LEN(str, t-s); 06502 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str)); 06503 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; 06504 return str; 06505 } 06506 return Qnil; 06507 } 06508 06509 06510 /* 06511 * call-seq: 06512 * str.lstrip -> new_str 06513 * 06514 * Returns a copy of <i>str</i> with leading whitespace removed. See also 06515 * <code>String#rstrip</code> and <code>String#strip</code>. 06516 * 06517 * " hello ".lstrip #=> "hello " 06518 * "hello".lstrip #=> "hello" 06519 */ 06520 06521 static VALUE 06522 rb_str_lstrip(VALUE str) 06523 { 06524 str = rb_str_dup(str); 06525 rb_str_lstrip_bang(str); 06526 return str; 06527 } 06528 06529 06530 /* 06531 * call-seq: 06532 * str.rstrip! -> self or nil 06533 * 06534 * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if 06535 * no change was made. See also <code>String#lstrip!</code> and 06536 * <code>String#strip!</code>. 06537 * 06538 * " hello ".rstrip #=> " hello" 06539 * "hello".rstrip! 
#=> nil 06540 */ 06541 06542 static VALUE 06543 rb_str_rstrip_bang(VALUE str) 06544 { 06545 rb_encoding *enc; 06546 char *s, *t, *e; 06547 06548 str_modify_keep_cr(str); 06549 enc = STR_ENC_GET(str); 06550 rb_str_check_dummy_enc(enc); 06551 s = RSTRING_PTR(str); 06552 if (!s || RSTRING_LEN(str) == 0) return Qnil; 06553 t = e = RSTRING_END(str); 06554 06555 /* remove trailing spaces or '\0's */ 06556 if (single_byte_optimizable(str)) { 06557 unsigned char c; 06558 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--; 06559 } 06560 else { 06561 char *tp; 06562 06563 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) { 06564 unsigned int c = rb_enc_codepoint(tp, e, enc); 06565 if (c && !rb_isspace(c)) break; 06566 t = tp; 06567 } 06568 } 06569 if (t < e) { 06570 long len = t-RSTRING_PTR(str); 06571 06572 STR_SET_LEN(str, len); 06573 RSTRING_PTR(str)[len] = '\0'; 06574 return str; 06575 } 06576 return Qnil; 06577 } 06578 06579 06580 /* 06581 * call-seq: 06582 * str.rstrip -> new_str 06583 * 06584 * Returns a copy of <i>str</i> with trailing whitespace removed. See also 06585 * <code>String#lstrip</code> and <code>String#strip</code>. 06586 * 06587 * " hello ".rstrip #=> " hello" 06588 * "hello".rstrip #=> "hello" 06589 */ 06590 06591 static VALUE 06592 rb_str_rstrip(VALUE str) 06593 { 06594 str = rb_str_dup(str); 06595 rb_str_rstrip_bang(str); 06596 return str; 06597 } 06598 06599 06600 /* 06601 * call-seq: 06602 * str.strip! -> str or nil 06603 * 06604 * Removes leading and trailing whitespace from <i>str</i>. Returns 06605 * <code>nil</code> if <i>str</i> was not altered. 
06606 */ 06607 06608 static VALUE 06609 rb_str_strip_bang(VALUE str) 06610 { 06611 VALUE l = rb_str_lstrip_bang(str); 06612 VALUE r = rb_str_rstrip_bang(str); 06613 06614 if (NIL_P(l) && NIL_P(r)) return Qnil; 06615 return str; 06616 } 06617 06618 06619 /* 06620 * call-seq: 06621 * str.strip -> new_str 06622 * 06623 * Returns a copy of <i>str</i> with leading and trailing whitespace removed. 06624 * 06625 * " hello ".strip #=> "hello" 06626 * "\tgoodbye\r\n".strip #=> "goodbye" 06627 */ 06628 06629 static VALUE 06630 rb_str_strip(VALUE str) 06631 { 06632 str = rb_str_dup(str); 06633 rb_str_strip_bang(str); 06634 return str; 06635 } 06636 06637 static VALUE 06638 scan_once(VALUE str, VALUE pat, long *start) 06639 { 06640 VALUE result, match; 06641 struct re_registers *regs; 06642 int i; 06643 06644 if (rb_reg_search(pat, str, *start, 0) >= 0) { 06645 match = rb_backref_get(); 06646 regs = RMATCH_REGS(match); 06647 if (BEG(0) == END(0)) { 06648 rb_encoding *enc = STR_ENC_GET(str); 06649 /* 06650 * Always consume at least one character of the input string 06651 */ 06652 if (RSTRING_LEN(str) > END(0)) 06653 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), 06654 RSTRING_END(str), enc); 06655 else 06656 *start = END(0)+1; 06657 } 06658 else { 06659 *start = END(0); 06660 } 06661 if (regs->num_regs == 1) { 06662 return rb_reg_nth_match(0, match); 06663 } 06664 result = rb_ary_new2(regs->num_regs); 06665 for (i=1; i < regs->num_regs; i++) { 06666 rb_ary_push(result, rb_reg_nth_match(i, match)); 06667 } 06668 06669 return result; 06670 } 06671 return Qnil; 06672 } 06673 06674 06675 /* 06676 * call-seq: 06677 * str.scan(pattern) -> array 06678 * str.scan(pattern) {|match, ...| block } -> str 06679 * 06680 * Both forms iterate through <i>str</i>, matching the pattern (which may be a 06681 * <code>Regexp</code> or a <code>String</code>). For each match, a result is 06682 * generated and either added to the result array or passed to the block. 
If 06683 * the pattern contains no groups, each individual result consists of the 06684 * matched string, <code>$&</code>. If the pattern contains groups, each 06685 * individual result is itself an array containing one entry per group. 06686 * 06687 * a = "cruel world" 06688 * a.scan(/\w+/) #=> ["cruel", "world"] 06689 * a.scan(/.../) #=> ["cru", "el ", "wor"] 06690 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]] 06691 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]] 06692 * 06693 * And the block form: 06694 * 06695 * a.scan(/\w+/) {|w| print "<<#{w}>> " } 06696 * print "\n" 06697 * a.scan(/(.)(.)/) {|x,y| print y, x } 06698 * print "\n" 06699 * 06700 * <em>produces:</em> 06701 * 06702 * <<cruel>> <<world>> 06703 * rceu lowlr 06704 */ 06705 06706 static VALUE 06707 rb_str_scan(VALUE str, VALUE pat) 06708 { 06709 VALUE result; 06710 long start = 0; 06711 long last = -1, prev = 0; 06712 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str); 06713 06714 pat = get_pat(pat, 1); 06715 if (!rb_block_given_p()) { 06716 VALUE ary = rb_ary_new(); 06717 06718 while (!NIL_P(result = scan_once(str, pat, &start))) { 06719 last = prev; 06720 prev = start; 06721 rb_ary_push(ary, result); 06722 } 06723 if (last >= 0) rb_reg_search(pat, str, last, 0); 06724 return ary; 06725 } 06726 06727 while (!NIL_P(result = scan_once(str, pat, &start))) { 06728 last = prev; 06729 prev = start; 06730 rb_yield(result); 06731 str_mod_check(str, p, len); 06732 } 06733 if (last >= 0) rb_reg_search(pat, str, last, 0); 06734 return str; 06735 } 06736 06737 06738 /* 06739 * call-seq: 06740 * str.hex -> integer 06741 * 06742 * Treats leading characters from <i>str</i> as a string of hexadecimal digits 06743 * (with an optional sign and an optional <code>0x</code>) and returns the 06744 * corresponding number. Zero is returned on error. 
06745 * 06746 * "0x0a".hex #=> 10 06747 * "-1234".hex #=> -4660 06748 * "0".hex #=> 0 06749 * "wombat".hex #=> 0 06750 */ 06751 06752 static VALUE 06753 rb_str_hex(VALUE str) 06754 { 06755 rb_encoding *enc = rb_enc_get(str); 06756 06757 if (!rb_enc_asciicompat(enc)) { 06758 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06759 } 06760 return rb_str_to_inum(str, 16, FALSE); 06761 } 06762 06763 06764 /* 06765 * call-seq: 06766 * str.oct -> integer 06767 * 06768 * Treats leading characters of <i>str</i> as a string of octal digits (with an 06769 * optional sign) and returns the corresponding number. Returns 0 if the 06770 * conversion fails. 06771 * 06772 * "123".oct #=> 83 06773 * "-377".oct #=> -255 06774 * "bad".oct #=> 0 06775 * "0377bad".oct #=> 255 06776 */ 06777 06778 static VALUE 06779 rb_str_oct(VALUE str) 06780 { 06781 rb_encoding *enc = rb_enc_get(str); 06782 06783 if (!rb_enc_asciicompat(enc)) { 06784 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc)); 06785 } 06786 return rb_str_to_inum(str, -8, FALSE); 06787 } 06788 06789 06790 /* 06791 * call-seq: 06792 * str.crypt(other_str) -> new_str 06793 * 06794 * Applies a one-way cryptographic hash to <i>str</i> by invoking the standard 06795 * library function <code>crypt</code>. The argument is the salt string, which 06796 * should be two characters long, each character drawn from 06797 * <code>[a-zA-Z0-9./]</code>. 
06798 */ 06799 06800 static VALUE 06801 rb_str_crypt(VALUE str, VALUE salt) 06802 { 06803 extern char *crypt(const char *, const char *); 06804 VALUE result; 06805 const char *s, *saltp; 06806 char *res; 06807 #ifdef BROKEN_CRYPT 06808 char salt_8bit_clean[3]; 06809 #endif 06810 06811 StringValue(salt); 06812 if (RSTRING_LEN(salt) < 2) 06813 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)"); 06814 06815 s = RSTRING_PTR(str); 06816 if (!s) s = ""; 06817 saltp = RSTRING_PTR(salt); 06818 #ifdef BROKEN_CRYPT 06819 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) { 06820 salt_8bit_clean[0] = saltp[0] & 0x7f; 06821 salt_8bit_clean[1] = saltp[1] & 0x7f; 06822 salt_8bit_clean[2] = '\0'; 06823 saltp = salt_8bit_clean; 06824 } 06825 #endif 06826 res = crypt(s, saltp); 06827 if (!res) { 06828 rb_sys_fail("crypt"); 06829 } 06830 result = rb_str_new2(res); 06831 OBJ_INFECT(result, str); 06832 OBJ_INFECT(result, salt); 06833 return result; 06834 } 06835 06836 06837 /* 06838 * call-seq: 06839 * str.intern -> symbol 06840 * str.to_sym -> symbol 06841 * 06842 * Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the 06843 * symbol if it did not previously exist. See <code>Symbol#id2name</code>. 06844 * 06845 * "Koala".intern #=> :Koala 06846 * s = 'cat'.to_sym #=> :cat 06847 * s == :cat #=> true 06848 * s = '@cat'.to_sym #=> :@cat 06849 * s == :@cat #=> true 06850 * 06851 * This can also be used to create symbols that cannot be represented using the 06852 * <code>:xxx</code> notation. 06853 * 06854 * 'cat and dog'.to_sym #=> :"cat and dog" 06855 */ 06856 06857 VALUE 06858 rb_str_intern(VALUE s) 06859 { 06860 VALUE str = RB_GC_GUARD(s); 06861 ID id; 06862 06863 id = rb_intern_str(str); 06864 return ID2SYM(id); 06865 } 06866 06867 06868 /* 06869 * call-seq: 06870 * str.ord -> integer 06871 * 06872 * Return the <code>Integer</code> ordinal of a one-character string. 
06873 * 06874 * "a".ord #=> 97 06875 */ 06876 06877 VALUE 06878 rb_str_ord(VALUE s) 06879 { 06880 unsigned int c; 06881 06882 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s)); 06883 return UINT2NUM(c); 06884 } 06885 /* 06886 * call-seq: 06887 * str.sum(n=16) -> integer 06888 * 06889 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>, 06890 * where <em>n</em> is the optional <code>Fixnum</code> parameter, defaulting 06891 * to 16. The result is simply the sum of the binary value of each character in 06892 * <i>str</i> modulo <code>2**n - 1</code>. This is not a particularly good 06893 * checksum. 06894 */ 06895 06896 static VALUE 06897 rb_str_sum(int argc, VALUE *argv, VALUE str) 06898 { 06899 VALUE vbits; 06900 int bits; 06901 char *ptr, *p, *pend; 06902 long len; 06903 VALUE sum = INT2FIX(0); 06904 unsigned long sum0 = 0; 06905 06906 if (argc == 0) { 06907 bits = 16; 06908 } 06909 else { 06910 rb_scan_args(argc, argv, "01", &vbits); 06911 bits = NUM2INT(vbits); 06912 } 06913 ptr = p = RSTRING_PTR(str); 06914 len = RSTRING_LEN(str); 06915 pend = p + len; 06916 06917 while (p < pend) { 06918 if (FIXNUM_MAX - UCHAR_MAX < sum0) { 06919 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06920 str_mod_check(str, ptr, len); 06921 sum0 = 0; 06922 } 06923 sum0 += (unsigned char)*p; 06924 p++; 06925 } 06926 06927 if (bits == 0) { 06928 if (sum0) { 06929 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06930 } 06931 } 06932 else { 06933 if (sum == INT2FIX(0)) { 06934 if (bits < (int)sizeof(long)*CHAR_BIT) { 06935 sum0 &= (((unsigned long)1)<<bits)-1; 06936 } 06937 sum = LONG2FIX(sum0); 06938 } 06939 else { 06940 VALUE mod; 06941 06942 if (sum0) { 06943 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0)); 06944 } 06945 06946 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits)); 06947 mod = rb_funcall(mod, '-', 1, INT2FIX(1)); 06948 sum = rb_funcall(sum, '&', 1, mod); 06949 } 06950 } 06951 return sum; 06952 } 06953 06954 static VALUE 
06955 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) 06956 { 06957 rb_encoding *enc; 06958 VALUE w; 06959 long width, len, flen = 1, fclen = 1; 06960 VALUE res; 06961 char *p; 06962 const char *f = " "; 06963 long n, size, llen, rlen, llen2 = 0, rlen2 = 0; 06964 volatile VALUE pad; 06965 int singlebyte = 1, cr; 06966 06967 rb_scan_args(argc, argv, "11", &w, &pad); 06968 enc = STR_ENC_GET(str); 06969 width = NUM2LONG(w); 06970 if (argc == 2) { 06971 StringValue(pad); 06972 enc = rb_enc_check(str, pad); 06973 f = RSTRING_PTR(pad); 06974 flen = RSTRING_LEN(pad); 06975 fclen = str_strlen(pad, enc); 06976 singlebyte = single_byte_optimizable(pad); 06977 if (flen == 0 || fclen == 0) { 06978 rb_raise(rb_eArgError, "zero width padding"); 06979 } 06980 } 06981 len = str_strlen(str, enc); 06982 if (width < 0 || len >= width) return rb_str_dup(str); 06983 n = width - len; 06984 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); 06985 rlen = n - llen; 06986 cr = ENC_CODERANGE(str); 06987 if (flen > 1) { 06988 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte); 06989 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte); 06990 } 06991 size = RSTRING_LEN(str); 06992 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen || 06993 (len *= flen) >= LONG_MAX - llen2 - rlen2 || 06994 (len += llen2 + rlen2) >= LONG_MAX - size) { 06995 rb_raise(rb_eArgError, "argument too big"); 06996 } 06997 len += size; 06998 res = rb_str_new5(str, 0, len); 06999 p = RSTRING_PTR(res); 07000 if (flen <= 1) { 07001 memset(p, *f, llen); 07002 p += llen; 07003 } 07004 else { 07005 while (llen >= fclen) { 07006 memcpy(p,f,flen); 07007 p += flen; 07008 llen -= fclen; 07009 } 07010 if (llen > 0) { 07011 memcpy(p, f, llen2); 07012 p += llen2; 07013 } 07014 } 07015 memcpy(p, RSTRING_PTR(str), size); 07016 p += size; 07017 if (flen <= 1) { 07018 memset(p, *f, rlen); 07019 p += rlen; 07020 } 07021 else { 07022 while (rlen >= fclen) { 07023 memcpy(p,f,flen); 07024 p += 
flen; 07025 rlen -= fclen; 07026 } 07027 if (rlen > 0) { 07028 memcpy(p, f, rlen2); 07029 p += rlen2; 07030 } 07031 } 07032 *p = '\0'; 07033 STR_SET_LEN(res, p-RSTRING_PTR(res)); 07034 OBJ_INFECT(res, str); 07035 if (!NIL_P(pad)) OBJ_INFECT(res, pad); 07036 rb_enc_associate(res, enc); 07037 if (argc == 2) 07038 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad)); 07039 if (cr != ENC_CODERANGE_BROKEN) 07040 ENC_CODERANGE_SET(res, cr); 07041 return res; 07042 } 07043 07044 07045 /* 07046 * call-seq: 07047 * str.ljust(integer, padstr=' ') -> new_str 07048 * 07049 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07050 * <code>String</code> of length <i>integer</i> with <i>str</i> left justified 07051 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07052 * 07053 * "hello".ljust(4) #=> "hello" 07054 * "hello".ljust(20) #=> "hello " 07055 * "hello".ljust(20, '1234') #=> "hello123412341234123" 07056 */ 07057 07058 static VALUE 07059 rb_str_ljust(int argc, VALUE *argv, VALUE str) 07060 { 07061 return rb_str_justify(argc, argv, str, 'l'); 07062 } 07063 07064 07065 /* 07066 * call-seq: 07067 * str.rjust(integer, padstr=' ') -> new_str 07068 * 07069 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07070 * <code>String</code> of length <i>integer</i> with <i>str</i> right justified 07071 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>. 
07072 * 07073 * "hello".rjust(4) #=> "hello" 07074 * "hello".rjust(20) #=> " hello" 07075 * "hello".rjust(20, '1234') #=> "123412341234123hello" 07076 */ 07077 07078 static VALUE 07079 rb_str_rjust(int argc, VALUE *argv, VALUE str) 07080 { 07081 return rb_str_justify(argc, argv, str, 'r'); 07082 } 07083 07084 07085 /* 07086 * call-seq: 07087 * str.center(integer, padstr) -> new_str 07088 * 07089 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new 07090 * <code>String</code> of length <i>integer</i> with <i>str</i> centered and 07091 * padded with <i>padstr</i>; otherwise, returns <i>str</i>. 07092 * 07093 * "hello".center(4) #=> "hello" 07094 * "hello".center(20) #=> " hello " 07095 * "hello".center(20, '123') #=> "1231231hello12312312" 07096 */ 07097 07098 static VALUE 07099 rb_str_center(int argc, VALUE *argv, VALUE str) 07100 { 07101 return rb_str_justify(argc, argv, str, 'c'); 07102 } 07103 07104 /* 07105 * call-seq: 07106 * str.partition(sep) -> [head, sep, tail] 07107 * str.partition(regexp) -> [head, match, tail] 07108 * 07109 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string 07110 * and returns the part before it, the match, and the part 07111 * after it. 07112 * If it is not found, returns two empty strings and <i>str</i>. 
07113 * 07114 * "hello".partition("l") #=> ["he", "l", "lo"] 07115 * "hello".partition("x") #=> ["hello", "", ""] 07116 * "hello".partition(/.l/) #=> ["h", "el", "lo"] 07117 */ 07118 07119 static VALUE 07120 rb_str_partition(VALUE str, VALUE sep) 07121 { 07122 long pos; 07123 int regex = FALSE; 07124 07125 if (TYPE(sep) == T_REGEXP) { 07126 pos = rb_reg_search(sep, str, 0, 0); 07127 regex = TRUE; 07128 } 07129 else { 07130 VALUE tmp; 07131 07132 tmp = rb_check_string_type(sep); 07133 if (NIL_P(tmp)) { 07134 rb_raise(rb_eTypeError, "type mismatch: %s given", 07135 rb_obj_classname(sep)); 07136 } 07137 sep = tmp; 07138 pos = rb_str_index(str, sep, 0); 07139 } 07140 if (pos < 0) { 07141 failed: 07142 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str)); 07143 } 07144 if (regex) { 07145 sep = rb_str_subpat(str, sep, INT2FIX(0)); 07146 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; 07147 } 07148 return rb_ary_new3(3, rb_str_subseq(str, 0, pos), 07149 sep, 07150 rb_str_subseq(str, pos+RSTRING_LEN(sep), 07151 RSTRING_LEN(str)-pos-RSTRING_LEN(sep))); 07152 } 07153 07154 /* 07155 * call-seq: 07156 * str.rpartition(sep) -> [head, sep, tail] 07157 * str.rpartition(regexp) -> [head, match, tail] 07158 * 07159 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end 07160 * of the string, and returns the part before it, the match, and the part 07161 * after it. 07162 * If it is not found, returns two empty strings and <i>str</i>. 
07163 * 07164 * "hello".rpartition("l") #=> ["hel", "l", "o"] 07165 * "hello".rpartition("x") #=> ["", "", "hello"] 07166 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"] 07167 */ 07168 07169 static VALUE 07170 rb_str_rpartition(VALUE str, VALUE sep) 07171 { 07172 long pos = RSTRING_LEN(str); 07173 int regex = FALSE; 07174 07175 if (TYPE(sep) == T_REGEXP) { 07176 pos = rb_reg_search(sep, str, pos, 1); 07177 regex = TRUE; 07178 } 07179 else { 07180 VALUE tmp; 07181 07182 tmp = rb_check_string_type(sep); 07183 if (NIL_P(tmp)) { 07184 rb_raise(rb_eTypeError, "type mismatch: %s given", 07185 rb_obj_classname(sep)); 07186 } 07187 sep = tmp; 07188 pos = rb_str_sublen(str, pos); 07189 pos = rb_str_rindex(str, sep, pos); 07190 } 07191 if (pos < 0) { 07192 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str); 07193 } 07194 if (regex) { 07195 sep = rb_reg_nth_match(0, rb_backref_get()); 07196 } 07197 return rb_ary_new3(3, rb_str_substr(str, 0, pos), 07198 sep, 07199 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str))); 07200 } 07201 07202 /* 07203 * call-seq: 07204 * str.start_with?([prefix]+) -> true or false 07205 * 07206 * Returns true if <i>str</i> starts with one of the prefixes given. 07207 * 07208 * p "hello".start_with?("hell") #=> true 07209 * 07210 * # returns true if one of the prefixes matches. 
07211 * p "hello".start_with?("heaven", "hell") #=> true 07212 * p "hello".start_with?("heaven", "paradise") #=> false 07213 * 07214 * 07215 * 07216 */ 07217 07218 static VALUE 07219 rb_str_start_with(int argc, VALUE *argv, VALUE str) 07220 { 07221 int i; 07222 07223 for (i=0; i<argc; i++) { 07224 VALUE tmp = rb_check_string_type(argv[i]); 07225 if (NIL_P(tmp)) continue; 07226 rb_enc_check(str, tmp); 07227 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07228 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07229 return Qtrue; 07230 } 07231 return Qfalse; 07232 } 07233 07234 /* 07235 * call-seq: 07236 * str.end_with?([suffix]+) -> true or false 07237 * 07238 * Returns true if <i>str</i> ends with one of the suffixes given. 07239 */ 07240 07241 static VALUE 07242 rb_str_end_with(int argc, VALUE *argv, VALUE str) 07243 { 07244 int i; 07245 char *p, *s, *e; 07246 rb_encoding *enc; 07247 07248 for (i=0; i<argc; i++) { 07249 VALUE tmp = rb_check_string_type(argv[i]); 07250 if (NIL_P(tmp)) continue; 07251 enc = rb_enc_check(str, tmp); 07252 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue; 07253 p = RSTRING_PTR(str); 07254 e = p + RSTRING_LEN(str); 07255 s = e - RSTRING_LEN(tmp); 07256 if (rb_enc_left_char_head(p, s, e, enc) != s) 07257 continue; 07258 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0) 07259 return Qtrue; 07260 } 07261 return Qfalse; 07262 } 07263 07264 void 07265 rb_str_setter(VALUE val, ID id, VALUE *var) 07266 { 07267 if (!NIL_P(val) && TYPE(val) != T_STRING) { 07268 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id)); 07269 } 07270 *var = val; 07271 } 07272 07273 07274 /* 07275 * call-seq: 07276 * str.force_encoding(encoding) -> str 07277 * 07278 * Changes the encoding to +encoding+ and returns self. 
07279 */ 07280 07281 static VALUE 07282 rb_str_force_encoding(VALUE str, VALUE enc) 07283 { 07284 str_modifiable(str); 07285 rb_enc_associate(str, rb_to_encoding(enc)); 07286 ENC_CODERANGE_CLEAR(str); 07287 return str; 07288 } 07289 07290 /* 07291 * call-seq: 07292 * str.valid_encoding? -> true or false 07293 * 07294 * Returns true for a string which encoded correctly. 07295 * 07296 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true 07297 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false 07298 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false 07299 */ 07300 07301 static VALUE 07302 rb_str_valid_encoding_p(VALUE str) 07303 { 07304 int cr = rb_enc_str_coderange(str); 07305 07306 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue; 07307 } 07308 07309 /* 07310 * call-seq: 07311 * str.ascii_only? -> true or false 07312 * 07313 * Returns true for a string which has only ASCII characters. 07314 * 07315 * "abc".force_encoding("UTF-8").ascii_only? #=> true 07316 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false 07317 */ 07318 07319 static VALUE 07320 rb_str_is_ascii_only_p(VALUE str) 07321 { 07322 int cr = rb_enc_str_coderange(str); 07323 07324 return cr == ENC_CODERANGE_7BIT ? 
Qtrue : Qfalse; 07325 } 07326 07341 VALUE 07342 rb_str_ellipsize(VALUE str, long len) 07343 { 07344 static const char ellipsis[] = "..."; 07345 const long ellipsislen = sizeof(ellipsis) - 1; 07346 rb_encoding *const enc = rb_enc_get(str); 07347 const long blen = RSTRING_LEN(str); 07348 const char *const p = RSTRING_PTR(str), *e = p + blen; 07349 VALUE estr, ret = 0; 07350 07351 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); 07352 if (len * rb_enc_mbminlen(enc) >= blen || 07353 (e = rb_enc_nth(p, e, len, enc)) - p == blen) { 07354 ret = str; 07355 } 07356 else if (len <= ellipsislen || 07357 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) { 07358 if (rb_enc_asciicompat(enc)) { 07359 ret = rb_str_new_with_class(str, ellipsis, len); 07360 rb_enc_associate(ret, enc); 07361 } 07362 else { 07363 estr = rb_usascii_str_new(ellipsis, len); 07364 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil); 07365 } 07366 } 07367 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) { 07368 rb_str_cat(ret, ellipsis, ellipsislen); 07369 } 07370 else { 07371 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen), 07372 rb_enc_from_encoding(enc), 0, Qnil); 07373 rb_str_append(ret, estr); 07374 } 07375 return ret; 07376 } 07377 07378 /********************************************************************** 07379 * Document-class: Symbol 07380 * 07381 * <code>Symbol</code> objects represent names and some strings 07382 * inside the Ruby 07383 * interpreter. They are generated using the <code>:name</code> and 07384 * <code>:"string"</code> literals 07385 * syntax, and by the various <code>to_sym</code> methods. The same 07386 * <code>Symbol</code> object will be created for a given name or string 07387 * for the duration of a program's execution, regardless of the context 07388 * or meaning of that name. 
Thus if <code>Fred</code> is a constant in 07389 * one context, a method in another, and a class in a third, the 07390 * <code>Symbol</code> <code>:Fred</code> will be the same object in 07391 * all three contexts. 07392 * 07393 * module One 07394 * class Fred 07395 * end 07396 * $f1 = :Fred 07397 * end 07398 * module Two 07399 * Fred = 1 07400 * $f2 = :Fred 07401 * end 07402 * def Fred() 07403 * end 07404 * $f3 = :Fred 07405 * $f1.object_id #=> 2514190 07406 * $f2.object_id #=> 2514190 07407 * $f3.object_id #=> 2514190 07408 * 07409 */ 07410 07411 07412 /* 07413 * call-seq: 07414 * sym == obj -> true or false 07415 * 07416 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same 07417 * symbol, returns <code>true</code>. 07418 */ 07419 07420 static VALUE 07421 sym_equal(VALUE sym1, VALUE sym2) 07422 { 07423 if (sym1 == sym2) return Qtrue; 07424 return Qfalse; 07425 } 07426 07427 07428 static int 07429 sym_printable(const char *s, const char *send, rb_encoding *enc) 07430 { 07431 while (s < send) { 07432 int n; 07433 int c = rb_enc_codepoint_len(s, send, &n, enc); 07434 07435 if (!rb_enc_isprint(c, enc)) return FALSE; 07436 s += n; 07437 } 07438 return TRUE; 07439 } 07440 07441 /* 07442 * call-seq: 07443 * sym.inspect -> string 07444 * 07445 * Returns the representation of <i>sym</i> as a symbol literal. 
07446 * 07447 * :fred.inspect #=> ":fred" 07448 */ 07449 07450 static VALUE 07451 sym_inspect(VALUE sym) 07452 { 07453 VALUE str; 07454 ID id = SYM2ID(sym); 07455 rb_encoding *enc; 07456 const char *ptr; 07457 long len; 07458 char *dest; 07459 rb_encoding *resenc = rb_default_internal_encoding(); 07460 07461 if (resenc == NULL) resenc = rb_default_external_encoding(); 07462 sym = rb_id2str(id); 07463 enc = STR_ENC_GET(sym); 07464 ptr = RSTRING_PTR(sym); 07465 len = RSTRING_LEN(sym); 07466 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) || 07467 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) { 07468 str = rb_str_inspect(sym); 07469 len = RSTRING_LEN(str); 07470 rb_str_resize(str, len + 1); 07471 dest = RSTRING_PTR(str); 07472 memmove(dest + 1, dest, len); 07473 dest[0] = ':'; 07474 } 07475 else { 07476 char *dest; 07477 str = rb_enc_str_new(0, len + 1, enc); 07478 dest = RSTRING_PTR(str); 07479 dest[0] = ':'; 07480 memcpy(dest + 1, ptr, len); 07481 } 07482 return str; 07483 } 07484 07485 07486 /* 07487 * call-seq: 07488 * sym.id2name -> string 07489 * sym.to_s -> string 07490 * 07491 * Returns the name or string corresponding to <i>sym</i>. 07492 * 07493 * :fred.id2name #=> "fred" 07494 */ 07495 07496 07497 VALUE 07498 rb_sym_to_s(VALUE sym) 07499 { 07500 ID id = SYM2ID(sym); 07501 07502 return str_new3(rb_cString, rb_id2str(id)); 07503 } 07504 07505 07506 /* 07507 * call-seq: 07508 * sym.to_sym -> sym 07509 * sym.intern -> sym 07510 * 07511 * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding 07512 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned 07513 * in this case. 
07514 */ 07515 07516 static VALUE 07517 sym_to_sym(VALUE sym) 07518 { 07519 return sym; 07520 } 07521 07522 static VALUE 07523 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv, VALUE passed_proc) 07524 { 07525 VALUE obj; 07526 07527 if (argc < 1) { 07528 rb_raise(rb_eArgError, "no receiver given"); 07529 } 07530 obj = argv[0]; 07531 return rb_funcall_with_block(obj, (ID)sym, argc - 1, argv + 1, passed_proc); 07532 } 07533 07534 /* 07535 * call-seq: 07536 * sym.to_proc 07537 * 07538 * Returns a _Proc_ object which respond to the given method by _sym_. 07539 * 07540 * (1..3).collect(&:to_s) #=> ["1", "2", "3"] 07541 */ 07542 07543 static VALUE 07544 sym_to_proc(VALUE sym) 07545 { 07546 static VALUE sym_proc_cache = Qfalse; 07547 enum {SYM_PROC_CACHE_SIZE = 67}; 07548 VALUE proc; 07549 long id, index; 07550 VALUE *aryp; 07551 07552 if (!sym_proc_cache) { 07553 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2); 07554 rb_gc_register_mark_object(sym_proc_cache); 07555 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil); 07556 } 07557 07558 id = SYM2ID(sym); 07559 index = (id % SYM_PROC_CACHE_SIZE) << 1; 07560 07561 aryp = RARRAY_PTR(sym_proc_cache); 07562 if (aryp[index] == sym) { 07563 return aryp[index + 1]; 07564 } 07565 else { 07566 proc = rb_proc_new(sym_call, (VALUE)id); 07567 aryp[index] = sym; 07568 aryp[index + 1] = proc; 07569 return proc; 07570 } 07571 } 07572 07573 /* 07574 * call-seq: 07575 * 07576 * sym.succ 07577 * 07578 * Same as <code>sym.to_s.succ.intern</code>. 07579 */ 07580 07581 static VALUE 07582 sym_succ(VALUE sym) 07583 { 07584 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym))); 07585 } 07586 07587 /* 07588 * call-seq: 07589 * 07590 * str <=> other -> -1, 0, +1 or nil 07591 * 07592 * Compares _sym_ with _other_ in string form. 
07593 */ 07594 07595 static VALUE 07596 sym_cmp(VALUE sym, VALUE other) 07597 { 07598 if (!SYMBOL_P(other)) { 07599 return Qnil; 07600 } 07601 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other)); 07602 } 07603 07604 /* 07605 * call-seq: 07606 * 07607 * sym.casecmp(other) -> -1, 0, +1 or nil 07608 * 07609 * Case-insensitive version of <code>Symbol#<=></code>. 07610 */ 07611 07612 static VALUE 07613 sym_casecmp(VALUE sym, VALUE other) 07614 { 07615 if (!SYMBOL_P(other)) { 07616 return Qnil; 07617 } 07618 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other)); 07619 } 07620 07621 /* 07622 * call-seq: 07623 * sym =~ obj -> fixnum or nil 07624 * 07625 * Returns <code>sym.to_s =~ obj</code>. 07626 */ 07627 07628 static VALUE 07629 sym_match(VALUE sym, VALUE other) 07630 { 07631 return rb_str_match(rb_sym_to_s(sym), other); 07632 } 07633 07634 /* 07635 * call-seq: 07636 * sym[idx] -> char 07637 * sym[b, n] -> char 07638 * 07639 * Returns <code>sym.to_s[]</code>. 07640 */ 07641 07642 static VALUE 07643 sym_aref(int argc, VALUE *argv, VALUE sym) 07644 { 07645 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym)); 07646 } 07647 07648 /* 07649 * call-seq: 07650 * sym.length -> integer 07651 * 07652 * Same as <code>sym.to_s.length</code>. 07653 */ 07654 07655 static VALUE 07656 sym_length(VALUE sym) 07657 { 07658 return rb_str_length(rb_id2str(SYM2ID(sym))); 07659 } 07660 07661 /* 07662 * call-seq: 07663 * sym.empty? -> true or false 07664 * 07665 * Returns that _sym_ is :"" or not. 07666 */ 07667 07668 static VALUE 07669 sym_empty(VALUE sym) 07670 { 07671 return rb_str_empty(rb_id2str(SYM2ID(sym))); 07672 } 07673 07674 /* 07675 * call-seq: 07676 * sym.upcase -> symbol 07677 * 07678 * Same as <code>sym.to_s.upcase.intern</code>. 
07679 */ 07680 07681 static VALUE 07682 sym_upcase(VALUE sym) 07683 { 07684 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym)))); 07685 } 07686 07687 /* 07688 * call-seq: 07689 * sym.downcase -> symbol 07690 * 07691 * Same as <code>sym.to_s.downcase.intern</code>. 07692 */ 07693 07694 static VALUE 07695 sym_downcase(VALUE sym) 07696 { 07697 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym)))); 07698 } 07699 07700 /* 07701 * call-seq: 07702 * sym.capitalize -> symbol 07703 * 07704 * Same as <code>sym.to_s.capitalize.intern</code>. 07705 */ 07706 07707 static VALUE 07708 sym_capitalize(VALUE sym) 07709 { 07710 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym)))); 07711 } 07712 07713 /* 07714 * call-seq: 07715 * sym.swapcase -> symbol 07716 * 07717 * Same as <code>sym.to_s.swapcase.intern</code>. 07718 */ 07719 07720 static VALUE 07721 sym_swapcase(VALUE sym) 07722 { 07723 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym)))); 07724 } 07725 07726 /* 07727 * call-seq: 07728 * sym.encoding -> encoding 07729 * 07730 * Returns the Encoding object that represents the encoding of _sym_. 07731 */ 07732 07733 static VALUE 07734 sym_encoding(VALUE sym) 07735 { 07736 return rb_obj_encoding(rb_id2str(SYM2ID(sym))); 07737 } 07738 07739 ID 07740 rb_to_id(VALUE name) 07741 { 07742 VALUE tmp; 07743 07744 switch (TYPE(name)) { 07745 default: 07746 tmp = rb_check_string_type(name); 07747 if (NIL_P(tmp)) { 07748 tmp = rb_inspect(name); 07749 rb_raise(rb_eTypeError, "%s is not a symbol", 07750 RSTRING_PTR(tmp)); 07751 } 07752 name = tmp; 07753 /* fall through */ 07754 case T_STRING: 07755 name = rb_str_intern(name); 07756 /* fall through */ 07757 case T_SYMBOL: 07758 return SYM2ID(name); 07759 } 07760 return Qnil; /* not reached */ 07761 } 07762 07763 /* 07764 * A <code>String</code> object holds and manipulates an arbitrary sequence of 07765 * bytes, typically representing characters. 
String objects may be created 07766 * using <code>String::new</code> or as literals. 07767 * 07768 * Because of aliasing issues, users of strings should be aware of the methods 07769 * that modify the contents of a <code>String</code> object. Typically, 07770 * methods with names ending in ``!'' modify their receiver, while those 07771 * without a ``!'' return a new <code>String</code>. However, there are 07772 * exceptions, such as <code>String#[]=</code>. 07773 * 07774 */ 07775 07776 void 07777 Init_String(void) 07778 { 07779 #undef rb_intern 07780 #define rb_intern(str) rb_intern_const(str) 07781 07782 rb_cString = rb_define_class("String", rb_cObject); 07783 rb_include_module(rb_cString, rb_mComparable); 07784 rb_define_alloc_func(rb_cString, str_alloc); 07785 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1); 07786 rb_define_method(rb_cString, "initialize", rb_str_init, -1); 07787 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1); 07788 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1); 07789 rb_define_method(rb_cString, "==", rb_str_equal, 1); 07790 rb_define_method(rb_cString, "===", rb_str_equal, 1); 07791 rb_define_method(rb_cString, "eql?", rb_str_eql, 1); 07792 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0); 07793 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1); 07794 rb_define_method(rb_cString, "+", rb_str_plus, 1); 07795 rb_define_method(rb_cString, "*", rb_str_times, 1); 07796 rb_define_method(rb_cString, "%", rb_str_format_m, 1); 07797 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1); 07798 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1); 07799 rb_define_method(rb_cString, "insert", rb_str_insert, 2); 07800 rb_define_method(rb_cString, "length", rb_str_length, 0); 07801 rb_define_method(rb_cString, "size", rb_str_length, 0); 07802 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0); 07803 rb_define_method(rb_cString, "empty?", rb_str_empty, 0); 07804 
rb_define_method(rb_cString, "=~", rb_str_match, 1); 07805 rb_define_method(rb_cString, "match", rb_str_match_m, -1); 07806 rb_define_method(rb_cString, "succ", rb_str_succ, 0); 07807 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0); 07808 rb_define_method(rb_cString, "next", rb_str_succ, 0); 07809 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0); 07810 rb_define_method(rb_cString, "upto", rb_str_upto, -1); 07811 rb_define_method(rb_cString, "index", rb_str_index_m, -1); 07812 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1); 07813 rb_define_method(rb_cString, "replace", rb_str_replace, 1); 07814 rb_define_method(rb_cString, "clear", rb_str_clear, 0); 07815 rb_define_method(rb_cString, "chr", rb_str_chr, 0); 07816 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1); 07817 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2); 07818 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1); 07819 07820 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1); 07821 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0); 07822 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0); 07823 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); 07824 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); 07825 rb_define_method(rb_cString, "dump", rb_str_dump, 0); 07826 07827 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0); 07828 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0); 07829 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0); 07830 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0); 07831 07832 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0); 07833 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0); 07834 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0); 07835 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0); 07836 07837 rb_define_method(rb_cString, "hex", rb_str_hex, 0); 07838 
rb_define_method(rb_cString, "oct", rb_str_oct, 0); 07839 rb_define_method(rb_cString, "split", rb_str_split_m, -1); 07840 rb_define_method(rb_cString, "lines", rb_str_each_line, -1); 07841 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0); 07842 rb_define_method(rb_cString, "chars", rb_str_each_char, 0); 07843 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0); 07844 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0); 07845 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0); 07846 rb_define_method(rb_cString, "concat", rb_str_concat, 1); 07847 rb_define_method(rb_cString, "<<", rb_str_concat, 1); 07848 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1); 07849 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1); 07850 rb_define_method(rb_cString, "intern", rb_str_intern, 0); 07851 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); 07852 rb_define_method(rb_cString, "ord", rb_str_ord, 0); 07853 07854 rb_define_method(rb_cString, "include?", rb_str_include, 1); 07855 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1); 07856 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1); 07857 07858 rb_define_method(rb_cString, "scan", rb_str_scan, 1); 07859 07860 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1); 07861 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1); 07862 rb_define_method(rb_cString, "center", rb_str_center, -1); 07863 07864 rb_define_method(rb_cString, "sub", rb_str_sub, -1); 07865 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1); 07866 rb_define_method(rb_cString, "chop", rb_str_chop, 0); 07867 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1); 07868 rb_define_method(rb_cString, "strip", rb_str_strip, 0); 07869 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0); 07870 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0); 07871 07872 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1); 07873 rb_define_method(rb_cString, 
"gsub!", rb_str_gsub_bang, -1); 07874 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0); 07875 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1); 07876 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0); 07877 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0); 07878 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0); 07879 07880 rb_define_method(rb_cString, "tr", rb_str_tr, 2); 07881 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2); 07882 rb_define_method(rb_cString, "delete", rb_str_delete, -1); 07883 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1); 07884 rb_define_method(rb_cString, "count", rb_str_count, -1); 07885 07886 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2); 07887 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2); 07888 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1); 07889 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1); 07890 07891 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1); 07892 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0); 07893 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0); 07894 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0); 07895 07896 rb_define_method(rb_cString, "sum", rb_str_sum, -1); 07897 07898 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1); 07899 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1); 07900 07901 rb_define_method(rb_cString, "partition", rb_str_partition, 1); 07902 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1); 07903 07904 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */ 07905 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1); 07906 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0); 07907 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0); 07908 07909 id_to_s = rb_intern("to_s"); 
07910 07911 rb_fs = Qnil; 07912 rb_define_variable("$;", &rb_fs); 07913 rb_define_variable("$-F", &rb_fs); 07914 07915 rb_cSymbol = rb_define_class("Symbol", rb_cObject); 07916 rb_include_module(rb_cSymbol, rb_mComparable); 07917 rb_undef_alloc_func(rb_cSymbol); 07918 rb_undef_method(CLASS_OF(rb_cSymbol), "new"); 07919 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in parse.y */ 07920 07921 rb_define_method(rb_cSymbol, "==", sym_equal, 1); 07922 rb_define_method(rb_cSymbol, "===", sym_equal, 1); 07923 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0); 07924 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0); 07925 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0); 07926 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0); 07927 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0); 07928 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0); 07929 rb_define_method(rb_cSymbol, "succ", sym_succ, 0); 07930 rb_define_method(rb_cSymbol, "next", sym_succ, 0); 07931 07932 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1); 07933 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1); 07934 rb_define_method(rb_cSymbol, "=~", sym_match, 1); 07935 07936 rb_define_method(rb_cSymbol, "[]", sym_aref, -1); 07937 rb_define_method(rb_cSymbol, "slice", sym_aref, -1); 07938 rb_define_method(rb_cSymbol, "length", sym_length, 0); 07939 rb_define_method(rb_cSymbol, "size", sym_length, 0); 07940 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0); 07941 rb_define_method(rb_cSymbol, "match", sym_match, 1); 07942 07943 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0); 07944 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0); 07945 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0); 07946 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0); 07947 07948 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0); 07949 } 07950
1.7.6.1