Ruby 3.1.4p223 (2023-03-30 revision HEAD)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
/* BEG(no) / END(no): read the `beg` / `end` entry at index `no` from a
 * variable named `regs`, which must be in scope wherever these are used
 * (presumably regex match registers -- confirm against callers). */
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
/* Remove any macro definitions of these names (presumably supplied by the
 * public headers -- confirm) so that the identifiers can be used as plain
 * function names in this file, e.g. the definition of rb_str_new below. */
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_tainted_str_new_cstr
67#undef rb_usascii_str_new_cstr
68#undef rb_utf8_str_new_cstr
69#undef rb_enc_str_new_cstr
70#undef rb_external_str_new_cstr
71#undef rb_locale_str_new_cstr
72#undef rb_str_dup_frozen
73#undef rb_str_buf_new_cstr
74#undef rb_str_buf_cat
75#undef rb_str_buf_cat2
76#undef rb_str_cat2
77#undef rb_str_cat_cstr
78#undef rb_fstring_cstr
79
82
83/* FLAGS of RString
84 *
85 * 1: RSTRING_NOEMBED
86 * 2: STR_SHARED (== ELTS_SHARED)
87 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
88 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
89 * other strings that rely on this string's buffer)
90 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
91 * early, specific to rb_str_tmp_frozen_{acquire,release})
92 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
93 * such as read(2). Any modification and realloc is prohibited)
94 *
95 * 8-9: ENC_CODERANGE (2 bits)
96 * 10-16: ENCODING (7 bits == 128)
97 * 17: RSTRING_FSTR
98 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
99 * used for a string object based on C string literal)
100 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
101 * object header is temporarily allocated on C stack)
102 */
103
/* Flag bit assignments for the RString flags listed in the table above.
 * NOTE(review): RUBY_MAX_CHAR_LEN is presumably an upper bound on the byte
 * length of one character -- confirm against its uses. */
104#define RUBY_MAX_CHAR_LEN 16
105#define STR_SHARED_ROOT FL_USER5
106#define STR_BORROWED FL_USER6
107#define STR_TMPLOCK FL_USER7
108#define STR_NOFREE FL_USER18
109#define STR_FAKESTR FL_USER19
110
/* STR_SET_NOEMBED(str): set the STR_NOEMBED flag on `str`.
 * With USE_RVARGC it also clears STR_SHARED, STR_SHARED_ROOT and
 * STR_BORROWED; otherwise it resets the embedded-length field to 0
 * (which, per the flag table above, occupies the same flag bits). */
111#define STR_SET_NOEMBED(str) do {\
112 FL_SET((str), STR_NOEMBED);\
113 if (USE_RVARGC) {\
114 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
115 }\
116 else {\
117 STR_SET_EMBED_LEN((str), 0);\
118 }\
119} while (0)
/* STR_SET_EMBED(str): clear both STR_NOEMBED and STR_NOFREE on `str`. */
120#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* STR_SET_EMBED_LEN(str, n): record `n` as the length of an embedded string.
 * USE_RVARGC: asserts that the embed capacity is strictly greater than `n`
 * and stores `n` in as.embed.len.
 * Otherwise: stores `n` in the RSTRING_EMBED_LEN bits of the object flags. */
121#if USE_RVARGC
122# define STR_SET_EMBED_LEN(str, n) do { \
123 assert(str_embed_capa(str) > (n));\
124 RSTRING(str)->as.embed.len = (n);\
125} while (0)
126#else
127# define STR_SET_EMBED_LEN(str, n) do { \
128 long tmp_n = (n);\
129 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
130 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
131} while (0)
132#endif
133
/* STR_SET_LEN(str, n): store `n` as the length of `str`, using the embedded
 * length field when STR_EMBED_P(str) holds and as.heap.len otherwise.
 * Only the length field is written; the buffer is not touched. */
134#define STR_SET_LEN(str, n) do { \
135 if (STR_EMBED_P(str)) {\
136 STR_SET_EMBED_LEN((str), (n));\
137 }\
138 else {\
139 RSTRING(str)->as.heap.len = (n);\
140 }\
141} while (0)
142
/* STR_DEC_LEN(str): decrease the recorded length of `str` by one, for either
 * the embedded or the heap representation. */
143#define STR_DEC_LEN(str) do {\
144 if (STR_EMBED_P(str)) {\
145 long n = RSTRING_LEN(str);\
146 n--;\
147 STR_SET_EMBED_LEN((str), n);\
148 }\
149 else {\
150 RSTRING(str)->as.heap.len--;\
151 }\
152} while (0)
153
/* TERM_LEN(str): terminator width for `str`, taken as the minimum character
 * length (rb_enc_mbminlen) of the encoding returned by rb_enc_get(str). */
154#define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* TERM_FILL(ptr, termlen): write the terminator at `ptr`.  One NUL byte is
 * always stored; when `termlen` is greater than 1 the full `termlen` bytes
 * are zeroed. */
155#define TERM_FILL(ptr, termlen) do {\
156 char *const term_fill_ptr = (ptr);\
157 const int term_fill_len = (termlen);\
158 *term_fill_ptr = '\0';\
159 if (UNLIKELY(term_fill_len > 1))\
160 memset(term_fill_ptr, 0, term_fill_len);\
161} while (0)
162
/*
 * RESIZE_CAPA(str, capacity): resize the buffer of `str` to `capacity`
 * bytes plus the terminator width of its current encoding (TERM_LEN).
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/*
 * RESIZE_CAPA_TERM(str, capacity, termlen): same, with an explicit
 * terminator width.
 *
 * - Embedded string: nothing happens while the embed area can hold
 *   capacity + termlen bytes; otherwise the contents are copied to a newly
 *   allocated buffer and the string is switched to the heap representation.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   STR_SHARED (asserted).
 *
 * `capacity` and `termlen` are parenthesized at every use so that
 * expression arguments (e.g. `a | b`, `c ? x : y`) keep their meaning.
 * `str` is evaluated several times; pass a side-effect-free expression.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
186
/* STR_SET_SHARED(str, shared_str): make `str` refer to `shared_str` as its
 * shared root.  Does nothing when `str` has STR_FAKESTR set.  Otherwise it
 * asserts that str's pointer lies inside shared_str's buffer, stores
 * `shared_str` in as.heap.aux.shared through RB_OBJ_WRITE, sets STR_SHARED
 * on `str` and STR_SHARED_ROOT on `shared_str`, and additionally sets
 * STR_BORROWED on `shared_str` when its class is 0. */
187#define STR_SET_SHARED(str, shared_str) do { \
188 if (!FL_TEST(str, STR_FAKESTR)) { \
189 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
190 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
191 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
192 FL_SET((str), STR_SHARED); \
193 FL_SET((shared_str), STR_SHARED_ROOT); \
194 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
195 FL_SET_RAW((shared_str), STR_BORROWED); \
196 } \
197} while (0)
198
/* STR_HEAP_PTR(str): the heap buffer pointer of `str` (as.heap.ptr).
 * STR_HEAP_SIZE(str): as.heap.aux.capa plus the terminator width, i.e. the
 * size passed to the sized realloc/free calls in this file. */
199#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
200#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
201/* TODO: include the terminator size in capa. */
202
/* STR_ENC_GET(str): encoding of `str` as resolved by get_encoding(). */
203#define STR_ENC_GET(str) get_encoding(str)
204
/* SHARABLE_SUBSTRING_P(beg, len, end): with SHARABLE_MIDDLE_SUBSTRING left at
 * its default of 0 this is true only when beg + len == end; when
 * SHARABLE_MIDDLE_SUBSTRING is non-zero it is always 1. */
205#if !defined SHARABLE_MIDDLE_SUBSTRING
206# define SHARABLE_MIDDLE_SUBSTRING 0
207#endif
208#if !SHARABLE_MIDDLE_SUBSTRING
209#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
210#else
211#define SHARABLE_SUBSTRING_P(beg, len, end) 1
212#endif
213
214
215static inline long
216str_embed_capa(VALUE str)
217{
218#if USE_RVARGC
219 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
220#else
221 return RSTRING_EMBED_LEN_MAX + 1;
222#endif
223}
224
225static inline size_t
226str_embed_size(long capa)
227{
228 return offsetof(struct RString, as.embed.ary) + capa;
229}
230
231static inline bool
232STR_EMBEDDABLE_P(long len, long termlen)
233{
234#if USE_RVARGC
235 return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
236#else
237 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
238#endif
239}
240
/* Prototypes of file-local (static) helpers, declared here so they can be
 * called before their definitions (e.g. str_make_independent_expand is used
 * by str_make_independent right below). */
241static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
242static VALUE str_new_frozen(VALUE klass, VALUE orig);
243static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
244static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
245static VALUE str_new(VALUE klass, const char *ptr, long len);
246static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
247static inline void str_modifiable(VALUE str);
248static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
249
250static inline void
251str_make_independent(VALUE str)
252{
253 long len = RSTRING_LEN(str);
254 int termlen = TERM_LEN(str);
255 str_make_independent_expand((str), len, 0L, termlen);
256}
257
258static inline int str_dependent_p(VALUE str);
259
260void
261rb_str_make_independent(VALUE str)
262{
263 if (str_dependent_p(str)) {
264 str_make_independent(str);
265 }
266}
267
/*
 * Print a diagnostic to stderr saying that `func` is about to return NULL.
 *
 * func: name of the reporting function; printed verbatim through "%s".
 *
 * The message ends with a newline so it does not run into whatever is
 * written to stderr next (typically the crash report).
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately. "
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
277
278/* symbols for [up|down|swap]case/capitalize options */
279static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
280
281static rb_encoding *
282get_actual_encoding(const int encidx, VALUE str)
283{
284 const unsigned char *q;
285
286 switch (encidx) {
287 case ENCINDEX_UTF_16:
288 if (RSTRING_LEN(str) < 2) break;
289 q = (const unsigned char *)RSTRING_PTR(str);
290 if (q[0] == 0xFE && q[1] == 0xFF) {
291 return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
292 }
293 if (q[0] == 0xFF && q[1] == 0xFE) {
294 return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
295 }
296 return rb_ascii8bit_encoding();
297 case ENCINDEX_UTF_32:
298 if (RSTRING_LEN(str) < 4) break;
299 q = (const unsigned char *)RSTRING_PTR(str);
300 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
301 return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
302 }
303 if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
304 return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
305 }
306 return rb_ascii8bit_encoding();
307 }
308 return rb_enc_from_index(encidx);
309}
310
311static rb_encoding *
312get_encoding(VALUE str)
313{
314 return get_actual_encoding(ENCODING_GET(str), str);
315}
316
317static void
318mustnot_broken(VALUE str)
319{
320 if (is_broken_string(str)) {
321 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
322 }
323}
324
325static void
326mustnot_wchar(VALUE str)
327{
328 rb_encoding *enc = STR_ENC_GET(str);
329 if (rb_enc_mbminlen(enc) > 1) {
330 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
331 }
332}
333
334static int fstring_cmp(VALUE a, VALUE b);
335
336static VALUE register_fstring(VALUE str, bool copy);
337
338const struct st_hash_type rb_fstring_hash_type = {
339 fstring_cmp,
341};
342
/* BARE_STRING_P(str): true when `str` does not have FL_EXIVAR set and its
 * class is exactly rb_cString. */
343#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344
346 VALUE fstr;
347 bool copy;
348};
349
350static int
351fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
352{
353
354 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
355 VALUE str = (VALUE)*key;
356
357 if (existing) {
358 /* because of lazy sweep, str may be unmarked already and swept
359 * at next time */
360
361 if (rb_objspace_garbage_object_p(str)) {
362 arg->fstr = Qundef;
363 return ST_DELETE;
364 }
365
366 arg->fstr = str;
367 return ST_STOP;
368 }
369 else {
370 if (FL_TEST_RAW(str, STR_FAKESTR)) {
371 if (arg->copy) {
372 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
373 rb_enc_copy(new_str, str);
374 str = new_str;
375 }
376 else {
377 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
378 RSTRING(str)->as.heap.len,
379 ENCODING_GET(str));
380 }
381 OBJ_FREEZE_RAW(str);
382 }
383 else {
384 if (!OBJ_FROZEN(str))
385 str = str_new_frozen(rb_cString, str);
386 if (STR_SHARED_P(str)) { /* str should not be shared */
387 /* shared substring */
388 str_make_independent(str);
389 assert(OBJ_FROZEN(str));
390 }
391 if (!BARE_STRING_P(str)) {
392 str = str_new_frozen(rb_cString, str);
393 }
394 }
395 RBASIC(str)->flags |= RSTRING_FSTR;
396
397 *key = *value = arg->fstr = str;
398 return ST_CONTINUE;
399 }
400}
401
402RUBY_FUNC_EXPORTED
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE_RAW(str);
418 return str;
419 }
420 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
421 assert(OBJ_FROZEN(str));
422 return str;
423 }
424 }
425
426 if (!OBJ_FROZEN(str))
427 rb_str_resize(str, RSTRING_LEN(str));
428
429 fstr = register_fstring(str, FALSE);
430
431 if (!bare) {
432 str_replace_shared_without_enc(str, fstr);
433 OBJ_FREEZE_RAW(str);
434 return str;
435 }
436 return fstr;
437}
438
439static VALUE
440register_fstring(VALUE str, bool copy)
441{
442 struct fstr_update_arg args;
443 args.copy = copy;
444
445 RB_VM_LOCK_ENTER();
446 {
447 st_table *frozen_strings = rb_vm_fstring_table();
448 do {
449 args.fstr = str;
450 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
451 } while (args.fstr == Qundef);
452 }
453 RB_VM_LOCK_LEAVE();
454
455 assert(OBJ_FROZEN(args.fstr));
456 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
457 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
458 assert(RBASIC_CLASS(args.fstr) == rb_cString);
459 return args.fstr;
460}
461
462static VALUE
463setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
464{
465 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
466 /* SHARED to be allocated by the callback */
467
468 if (!name) {
469 RUBY_ASSERT_ALWAYS(len == 0);
470 name = "";
471 }
472
473 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
474
475 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
476 fake_str->as.heap.len = len;
477 fake_str->as.heap.ptr = (char *)name;
478 fake_str->as.heap.aux.capa = len;
479 return (VALUE)fake_str;
480}
481
482/*
483 * set up a fake string which refers a static string literal.
484 */
485VALUE
486rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
487{
488 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
489}
490
491/*
492 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
493 * shared string which refers a static string literal. `ptr` must
494 * point a constant string.
495 */
496MJIT_FUNC_EXPORTED VALUE
497rb_fstring_new(const char *ptr, long len)
498{
499 struct RString fake_str;
500 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
501}
502
503VALUE
504rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
505{
506 struct RString fake_str;
507 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
508}
509
510VALUE
511rb_fstring_cstr(const char *ptr)
512{
513 return rb_fstring_new(ptr, strlen(ptr));
514}
515
516static int
517fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
518{
519 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
520 return ST_CONTINUE;
521}
522
523static int
524fstring_cmp(VALUE a, VALUE b)
525{
526 long alen, blen;
527 const char *aptr, *bptr;
528 RSTRING_GETMEM(a, aptr, alen);
529 RSTRING_GETMEM(b, bptr, blen);
530 return (alen != blen ||
531 ENCODING_GET(a) != ENCODING_GET(b) ||
532 memcmp(aptr, bptr, alen) != 0);
533}
534
535static inline int
536single_byte_optimizable(VALUE str)
537{
538 rb_encoding *enc;
539
540 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
542 return 1;
543
544 enc = STR_ENC_GET(str);
545 if (rb_enc_mbmaxlen(enc) == 1)
546 return 1;
547
548 /* Conservative. Possibly single byte.
549 * "\xa1" in Shift_JIS for example. */
550 return 0;
551}
552
554
/*
 * Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * Three phases:
 *  1. unless unaligned word access is allowed, bytes up to the next
 *     pointer-size boundary are tested one at a time;
 *  2. whole uintptr_t words are tested against NONASCII_MASK, and the first
 *     offending byte inside a word is located with a leading/trailing
 *     zero count (depending on byte order);
 *  3. the remaining tail (fewer than SIZEOF_VOIDP bytes) is tested byte by
 *     byte.
 * The switch statements in phases 1 and 3 rely on case fall-through.
 */
555static inline const char *
556search_nonascii(const char *p, const char *e)
557{
558 const uintptr_t *s, *t;
559
/* NONASCII_MASK: a uintptr_t with 0x80 in every byte. */
560#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
561# if SIZEOF_UINTPTR_T == 8
562# define NONASCII_MASK UINT64_C(0x8080808080808080)
563# elif SIZEOF_UINTPTR_T == 4
564# define NONASCII_MASK UINT32_C(0x80808080)
565# else
566# error "don't know what to do."
567# endif
568#else
569# if SIZEOF_UINTPTR_T == 8
570# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
571# elif SIZEOF_UINTPTR_T == 4
572# define NONASCII_MASK 0x80808080UL /* or...? */
573# else
574# error "don't know what to do."
575# endif
576#endif
577
578 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
579#if !UNALIGNED_WORD_ACCESS
580 if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes before the next aligned address; p is advanced
 * first and the skipped bytes are then checked at negative offsets */
581 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
582 p += l;
583 switch (l) {
584 default: UNREACHABLE;
585#if SIZEOF_VOIDP > 4
586 case 7: if (p[-7]&0x80) return p-7;
587 case 6: if (p[-6]&0x80) return p-6;
588 case 5: if (p[-5]&0x80) return p-5;
589 case 4: if (p[-4]&0x80) return p-4;
590#endif
591 case 3: if (p[-3]&0x80) return p-3;
592 case 2: if (p[-2]&0x80) return p-2;
593 case 1: if (p[-1]&0x80) return p-1;
594 case 0: break;
595 }
596 }
597#endif
598#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
599#define aligned_ptr(value) \
600 __builtin_assume_aligned((value), sizeof(uintptr_t))
601#else
602#define aligned_ptr(value) (uintptr_t *)(value)
603#endif
604 s = aligned_ptr(p);
/* t: upper bound for word reads, so that a full word starting below t
 * stays inside [p, e) */
605 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
606#undef aligned_ptr
607 for (;s < t; s++) {
608 if (*s & NONASCII_MASK) {
609#ifdef WORDS_BIGENDIAN
610 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
611#else
612 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
613#endif
614 }
615 }
616 p = (const char *)s;
617 }
618
/* tail: e - p bytes remain, checked at negative offsets from e */
619 switch (e - p) {
620 default: UNREACHABLE;
621#if SIZEOF_VOIDP > 4
622 case 7: if (e[-7]&0x80) return e-7;
623 case 6: if (e[-6]&0x80) return e-6;
624 case 5: if (e[-5]&0x80) return e-5;
625 case 4: if (e[-4]&0x80) return e-4;
626#endif
627 case 3: if (e[-3]&0x80) return e-3;
628 case 2: if (e[-2]&0x80) return e-2;
629 case 1: if (e[-1]&0x80) return e-1;
630 case 0: return NULL;
631 }
632}
633
634static int
635coderange_scan(const char *p, long len, rb_encoding *enc)
636{
637 const char *e = p + len;
638
639 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
640 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
641 p = search_nonascii(p, e);
643 }
644
645 if (rb_enc_asciicompat(enc)) {
646 p = search_nonascii(p, e);
647 if (!p) return ENC_CODERANGE_7BIT;
648 for (;;) {
649 int ret = rb_enc_precise_mbclen(p, e, enc);
651 p += MBCLEN_CHARFOUND_LEN(ret);
652 if (p == e) break;
653 p = search_nonascii(p, e);
654 if (!p) break;
655 }
656 }
657 else {
658 while (p < e) {
659 int ret = rb_enc_precise_mbclen(p, e, enc);
661 p += MBCLEN_CHARFOUND_LEN(ret);
662 }
663 }
664 return ENC_CODERANGE_VALID;
665}
666
667long
668rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
669{
670 const char *p = s;
671
672 if (*cr == ENC_CODERANGE_BROKEN)
673 return e - s;
674
675 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
676 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
677 if (*cr == ENC_CODERANGE_VALID) return e - s;
678 p = search_nonascii(p, e);
680 return e - s;
681 }
682 else if (rb_enc_asciicompat(enc)) {
683 p = search_nonascii(p, e);
684 if (!p) {
685 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
686 return e - s;
687 }
688 for (;;) {
689 int ret = rb_enc_precise_mbclen(p, e, enc);
690 if (!MBCLEN_CHARFOUND_P(ret)) {
692 return p - s;
693 }
694 p += MBCLEN_CHARFOUND_LEN(ret);
695 if (p == e) break;
696 p = search_nonascii(p, e);
697 if (!p) break;
698 }
699 }
700 else {
701 while (p < e) {
702 int ret = rb_enc_precise_mbclen(p, e, enc);
703 if (!MBCLEN_CHARFOUND_P(ret)) {
705 return p - s;
706 }
707 p += MBCLEN_CHARFOUND_LEN(ret);
708 }
709 }
711 return e - s;
712}
713
714static inline void
715str_enc_copy(VALUE str1, VALUE str2)
716{
717 rb_enc_set_index(str1, ENCODING_GET(str2));
718}
719
720static void
721rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
722{
723 /* this function is designed for copying encoding and coderange
724 * from src to new string "dest" which is made from the part of src.
725 */
726 str_enc_copy(dest, src);
727 if (RSTRING_LEN(dest) == 0) {
728 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
730 else
732 return;
733 }
734 switch (ENC_CODERANGE(src)) {
737 break;
739 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
740 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
742 else
744 break;
745 default:
746 break;
747 }
748}
749
750static void
751rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
752{
753 str_enc_copy(dest, src);
755}
756
757static int
758enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
759{
760 if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
761 rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
763 }
764 else {
765 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
766 }
767}
768
769int
770rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
771{
772 return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
773}
774
775int
777{
778 int cr = ENC_CODERANGE(str);
779
780 if (cr == ENC_CODERANGE_UNKNOWN) {
781 int encidx = ENCODING_GET(str);
782 rb_encoding *enc = rb_enc_from_index(encidx);
783 cr = enc_coderange_scan(str, enc, encidx);
784 ENC_CODERANGE_SET(str, cr);
785 }
786 return cr;
787}
788
789int
791{
792 rb_encoding *enc = STR_ENC_GET(str);
793
794 if (!rb_enc_asciicompat(enc))
795 return FALSE;
797 return TRUE;
798 return FALSE;
799}
800
801static inline void
802str_mod_check(VALUE s, const char *p, long len)
803{
804 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
805 rb_raise(rb_eRuntimeError, "string modified");
806 }
807}
808
809static size_t
810str_capacity(VALUE str, const int termlen)
811{
812 if (STR_EMBED_P(str)) {
813#if USE_RVARGC
814 return str_embed_capa(str) - termlen;
815#else
816 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
817#endif
818 }
819 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
820 return RSTRING(str)->as.heap.len;
821 }
822 else {
823 return RSTRING(str)->as.heap.aux.capa;
824 }
825}
826
827size_t
829{
830 return str_capacity(str, TERM_LEN(str));
831}
832
833static inline void
834must_not_null(const char *ptr)
835{
836 if (!ptr) {
837 rb_raise(rb_eArgError, "NULL pointer given");
838 }
839}
840
841static inline VALUE
842str_alloc(VALUE klass, size_t size)
843{
844 assert(size > 0);
845 RVARGC_NEWOBJ_OF(str, struct RString, klass,
847 return (VALUE)str;
848}
849
850static inline VALUE
851str_alloc_embed(VALUE klass, size_t capa)
852{
853 size_t size = str_embed_size(capa);
854 assert(rb_gc_size_allocatable_p(size));
855#if !USE_RVARGC
856 assert(size <= sizeof(struct RString));
857#endif
858 return str_alloc(klass, size);
859}
860
861static inline VALUE
862str_alloc_heap(VALUE klass)
863{
864 return str_alloc(klass, sizeof(struct RString));
865}
866
867static inline VALUE
868empty_str_alloc(VALUE klass)
869{
870 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
871 VALUE str = str_alloc_embed(klass, 0);
872 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
873 return str;
874}
875
876static VALUE
877str_new0(VALUE klass, const char *ptr, long len, int termlen)
878{
879 VALUE str;
880
881 if (len < 0) {
882 rb_raise(rb_eArgError, "negative string size (or size too big)");
883 }
884
885 RUBY_DTRACE_CREATE_HOOK(STRING, len);
886
887 if (STR_EMBEDDABLE_P(len, termlen)) {
888 str = str_alloc_embed(klass, len + termlen);
889 if (len == 0) {
891 }
892 }
893 else {
894 str = str_alloc_heap(klass);
895 RSTRING(str)->as.heap.aux.capa = len;
896 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
897 * integer overflow. If we can STATIC_ASSERT that, the following
898 * mul_add_mul can be reverted to a simple ALLOC_N. */
899 RSTRING(str)->as.heap.ptr =
900 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
901 STR_SET_NOEMBED(str);
902 }
903 if (ptr) {
904 memcpy(RSTRING_PTR(str), ptr, len);
905 }
906 STR_SET_LEN(str, len);
907 TERM_FILL(RSTRING_PTR(str) + len, termlen);
908 return str;
909}
910
911static VALUE
912str_new(VALUE klass, const char *ptr, long len)
913{
914 return str_new0(klass, ptr, len, 1);
915}
916
917VALUE
918rb_str_new(const char *ptr, long len)
919{
920 return str_new(rb_cString, ptr, len);
921}
922
923VALUE
924rb_usascii_str_new(const char *ptr, long len)
925{
926 VALUE str = rb_str_new(ptr, len);
927 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
928 return str;
929}
930
931VALUE
932rb_utf8_str_new(const char *ptr, long len)
933{
934 VALUE str = str_new(rb_cString, ptr, len);
935 rb_enc_associate_index(str, rb_utf8_encindex());
936 return str;
937}
938
939VALUE
940rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
941{
942 VALUE str;
943
944 if (!enc) return rb_str_new(ptr, len);
945
946 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
947 rb_enc_associate(str, enc);
948 return str;
949}
950
951VALUE
953{
954 must_not_null(ptr);
955 /* rb_str_new_cstr() can take pointer from non-malloc-generated
956 * memory regions, and that cannot be detected by the MSAN. Just
957 * trust the programmer that the argument passed here is a sane C
958 * string. */
959 __msan_unpoison_string(ptr);
960 return rb_str_new(ptr, strlen(ptr));
961}
962
963VALUE
965{
967 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
968 return str;
969}
970
971VALUE
973{
975 rb_enc_associate_index(str, rb_utf8_encindex());
976 return str;
977}
978
979VALUE
981{
982 must_not_null(ptr);
983 if (rb_enc_mbminlen(enc) != 1) {
984 rb_raise(rb_eArgError, "wchar encoding given");
985 }
986 return rb_enc_str_new(ptr, strlen(ptr), enc);
987}
988
989static VALUE
990str_new_static(VALUE klass, const char *ptr, long len, int encindex)
991{
992 VALUE str;
993
994 if (len < 0) {
995 rb_raise(rb_eArgError, "negative string size (or size too big)");
996 }
997
998 if (!ptr) {
999 rb_encoding *enc = rb_enc_get_from_index(encindex);
1000 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1001 }
1002 else {
1003 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1004 str = str_alloc_heap(klass);
1005 RSTRING(str)->as.heap.len = len;
1006 RSTRING(str)->as.heap.ptr = (char *)ptr;
1007 RSTRING(str)->as.heap.aux.capa = len;
1008 STR_SET_NOEMBED(str);
1009 RBASIC(str)->flags |= STR_NOFREE;
1010 }
1011 rb_enc_associate_index(str, encindex);
1012 return str;
1013}
1014
1015VALUE
1016rb_str_new_static(const char *ptr, long len)
1017{
1018 return str_new_static(rb_cString, ptr, len, 0);
1019}
1020
1021VALUE
1023{
1024 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1025}
1026
1027VALUE
1029{
1030 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1031}
1032
1033VALUE
1035{
1036 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1037}
1038
1039VALUE
1040rb_tainted_str_new(const char *ptr, long len)
1041{
1042 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new", NULL);
1043 return rb_str_new(ptr, len);
1044}
1045
1046VALUE
1048{
1049 rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new_cstr", NULL);
1050 return rb_str_new_cstr(ptr);
1051}
1052
1053static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1054 rb_encoding *from, rb_encoding *to,
1055 int ecflags, VALUE ecopts);
1056
1057static inline bool
1058is_enc_ascii_string(VALUE str, rb_encoding *enc)
1059{
1060 int encidx = rb_enc_to_index(enc);
1061 if (rb_enc_get_index(str) == encidx)
1062 return is_ascii_string(str);
1063 return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1064}
1065
1066VALUE
1067rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1068{
1069 long len;
1070 const char *ptr;
1071 VALUE newstr;
1072
1073 if (!to) return str;
1074 if (!from) from = rb_enc_get(str);
1075 if (from == to) return str;
1076 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1077 to == rb_ascii8bit_encoding()) {
1078 if (STR_ENC_GET(str) != to) {
1079 str = rb_str_dup(str);
1080 rb_enc_associate(str, to);
1081 }
1082 return str;
1083 }
1084
1085 RSTRING_GETMEM(str, ptr, len);
1086 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1087 from, to, ecflags, ecopts);
1088 if (NIL_P(newstr)) {
1089 /* some error, return original */
1090 return str;
1091 }
1092 return newstr;
1093}
1094
1095VALUE
1096rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1097 rb_encoding *from, int ecflags, VALUE ecopts)
1098{
1099 long olen;
1100
1101 olen = RSTRING_LEN(newstr);
1102 if (ofs < -olen || olen < ofs)
1103 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1104 if (ofs < 0) ofs += olen;
1105 if (!from) {
1106 STR_SET_LEN(newstr, ofs);
1107 return rb_str_cat(newstr, ptr, len);
1108 }
1109
1110 rb_str_modify(newstr);
1111 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1112 rb_enc_get(newstr),
1113 ecflags, ecopts);
1114}
1115
/* Reset `str` to length 0, associate it with `enc`, then append the `len`
 * bytes at `ptr`.  Returns `str`. */
1116VALUE
1117rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1118{
1119 STR_SET_LEN(str, 0);
1120 rb_enc_associate(str, enc);
1121 rb_str_cat(str, ptr, len);
1122 return str;
1123}
1124
1125static VALUE
1126str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1127 rb_encoding *from, rb_encoding *to,
1128 int ecflags, VALUE ecopts)
1129{
1130 rb_econv_t *ec;
1132 long olen;
1133 VALUE econv_wrapper;
1134 const unsigned char *start, *sp;
1135 unsigned char *dest, *dp;
1136 size_t converted_output = (size_t)ofs;
1137
1138 olen = rb_str_capacity(newstr);
1139
1140 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1141 RBASIC_CLEAR_CLASS(econv_wrapper);
1142 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1143 if (!ec) return Qnil;
1144 DATA_PTR(econv_wrapper) = ec;
1145
1146 sp = (unsigned char*)ptr;
1147 start = sp;
1148 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1149 (dp = dest + converted_output),
1150 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1152 /* destination buffer short */
1153 size_t converted_input = sp - start;
1154 size_t rest = len - converted_input;
1155 converted_output = dp - dest;
1156 rb_str_set_len(newstr, converted_output);
1157 if (converted_input && converted_output &&
1158 rest < (LONG_MAX / converted_output)) {
1159 rest = (rest * converted_output) / converted_input;
1160 }
1161 else {
1162 rest = olen;
1163 }
1164 olen += rest < 2 ? 2 : rest;
1165 rb_str_resize(newstr, olen);
1166 }
1167 DATA_PTR(econv_wrapper) = 0;
1168 rb_econv_close(ec);
1169 switch (ret) {
1170 case econv_finished:
1171 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1172 rb_str_set_len(newstr, len);
1173 rb_enc_associate(newstr, to);
1174 return newstr;
1175
1176 default:
1177 return Qnil;
1178 }
1179}
1180
1181VALUE
1183{
1184 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1185}
1186
1187VALUE
1189{
1190 rb_encoding *ienc;
1191 VALUE str;
1192 const int eidx = rb_enc_to_index(eenc);
1193
1194 if (!ptr) {
1195 return rb_enc_str_new(ptr, len, eenc);
1196 }
1197
1198 /* ASCII-8BIT case, no conversion */
1199 if ((eidx == rb_ascii8bit_encindex()) ||
1200 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1201 return rb_str_new(ptr, len);
1202 }
1203 /* no default_internal or same encoding, no conversion */
1204 ienc = rb_default_internal_encoding();
1205 if (!ienc || eenc == ienc) {
1206 return rb_enc_str_new(ptr, len, eenc);
1207 }
1208 /* ASCII compatible, and ASCII only string, no conversion in
1209 * default_internal */
1210 if ((eidx == rb_ascii8bit_encindex()) ||
1211 (eidx == rb_usascii_encindex()) ||
1212 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1213 return rb_enc_str_new(ptr, len, ienc);
1214 }
1215 /* convert from the given encoding to default_internal */
1216 str = rb_enc_str_new(NULL, 0, ienc);
1217 /* when the conversion failed for some reason, just ignore the
1218 * default_internal and result in the given encoding as-is. */
1219 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1220 rb_str_initialize(str, ptr, len, eenc);
1221 }
1222 return str;
1223}
1224
1225VALUE
1226rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1227{
1228 int eidx = rb_enc_to_index(eenc);
1229 if (eidx == rb_usascii_encindex() &&
1231 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1232 return str;
1233 }
1234 rb_enc_associate_index(str, eidx);
1235 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1236}
1237
1238VALUE
1239rb_external_str_new(const char *ptr, long len)
1240{
1241 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1242}
1243
1244VALUE
1246{
1247 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1248}
1249
1250VALUE
1251rb_locale_str_new(const char *ptr, long len)
1252{
1253 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1254}
1255
1256VALUE
1258{
1259 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1260}
1261
1262VALUE
1264{
1265 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1266}
1267
1268VALUE
1270{
1271 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1272}
1273
1274VALUE
1276{
1277 return rb_str_export_to_enc(str, rb_default_external_encoding());
1278}
1279
1280VALUE
1282{
1283 return rb_str_export_to_enc(str, rb_locale_encoding());
1284}
1285
1286VALUE
1288{
1289 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1290}
1291
1292static VALUE
1293str_replace_shared_without_enc(VALUE str2, VALUE str)
1294{
1295 const int termlen = TERM_LEN(str);
1296 char *ptr;
1297 long len;
1298
1299 RSTRING_GETMEM(str, ptr, len);
1300 if (str_embed_capa(str2) >= len + termlen) {
1301 char *ptr2 = RSTRING(str2)->as.embed.ary;
1302 STR_SET_EMBED(str2);
1303 memcpy(ptr2, RSTRING_PTR(str), len);
1304 STR_SET_EMBED_LEN(str2, len);
1305 TERM_FILL(ptr2+len, termlen);
1306 }
1307 else {
1308 VALUE root;
1309 if (STR_SHARED_P(str)) {
1310 root = RSTRING(str)->as.heap.aux.shared;
1311 RSTRING_GETMEM(str, ptr, len);
1312 }
1313 else {
1314 root = rb_str_new_frozen(str);
1315 RSTRING_GETMEM(root, ptr, len);
1316 }
1317 assert(OBJ_FROZEN(root));
1318 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1319 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1320 rb_fatal("about to free a possible shared root");
1321 }
1322 char *ptr2 = STR_HEAP_PTR(str2);
1323 if (ptr2 != ptr) {
1324 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1325 }
1326 }
1327 FL_SET(str2, STR_NOEMBED);
1328 RSTRING(str2)->as.heap.len = len;
1329 RSTRING(str2)->as.heap.ptr = ptr;
1330 STR_SET_SHARED(str2, root);
1331 }
1332 return str2;
1333}
1334
1335static VALUE
1336str_replace_shared(VALUE str2, VALUE str)
1337{
1338 str_replace_shared_without_enc(str2, str);
1339 rb_enc_cr_str_exact_copy(str2, str);
1340 return str2;
1341}
1342
1343static VALUE
1344str_new_shared(VALUE klass, VALUE str)
1345{
1346 return str_replace_shared(str_alloc_heap(klass), str);
1347}
1348
1349VALUE
1351{
1352 return str_new_shared(rb_obj_class(str), str);
1353}
1354
1355VALUE
1357{
1358 if (OBJ_FROZEN(orig)) return orig;
1359 return str_new_frozen(rb_obj_class(orig), orig);
1360}
1361
1362static VALUE
1363rb_str_new_frozen_String(VALUE orig)
1364{
1365 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1366 return str_new_frozen(rb_cString, orig);
1367}
1368
1369VALUE
1370rb_str_tmp_frozen_acquire(VALUE orig)
1371{
1372 if (OBJ_FROZEN_RAW(orig)) return orig;
1373 return str_new_frozen_buffer(0, orig, FALSE);
1374}
1375
1376void
1377rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1378{
1379 if (RBASIC_CLASS(tmp) != 0)
1380 return;
1381
1382 if (STR_EMBED_P(tmp)) {
1383 assert(OBJ_FROZEN_RAW(tmp));
1384 }
1385 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1386 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1387 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1388
1389 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1390 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1391 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1392
1393 /* Unshare orig since the root (tmp) only has this one child. */
1394 FL_UNSET_RAW(orig, STR_SHARED);
1395 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1396 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1397 assert(OBJ_FROZEN_RAW(tmp));
1398
1399 /* Make tmp embedded and empty so it is safe for sweeping. */
1400 STR_SET_EMBED(tmp);
1401 STR_SET_EMBED_LEN(tmp, 0);
1402 }
1403 }
1404}
1405
1406static VALUE
1407str_new_frozen(VALUE klass, VALUE orig)
1408{
1409 return str_new_frozen_buffer(klass, orig, TRUE);
1410}
1411
1412static VALUE
1413heap_str_make_shared(VALUE klass, VALUE orig)
1414{
1415 assert(!STR_EMBED_P(orig));
1416 assert(!STR_SHARED_P(orig));
1417
1418 VALUE str = str_alloc_heap(klass);
1419 STR_SET_NOEMBED(str);
1420 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1421 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1422 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1423 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1424 RBASIC(orig)->flags &= ~STR_NOFREE;
1425 STR_SET_SHARED(orig, str);
1426 if (klass == 0)
1427 FL_UNSET_RAW(str, STR_BORROWED);
1428 return str;
1429}
1430
1431static VALUE
1432str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1433{
1434 VALUE str;
1435
1436 long len = RSTRING_LEN(orig);
1437
1438 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
1439 str = str_new(klass, RSTRING_PTR(orig), len);
1440 assert(STR_EMBED_P(str));
1441 }
1442 else {
1443 if (FL_TEST_RAW(orig, STR_SHARED)) {
1444 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1445 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1446 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1447 assert(ofs >= 0);
1448 assert(rest >= 0);
1449 assert(ofs + rest <= RSTRING_LEN(shared));
1450#if !USE_RVARGC
1451 assert(!STR_EMBED_P(shared));
1452#endif
1453 assert(OBJ_FROZEN(shared));
1454
1455 if ((ofs > 0) || (rest > 0) ||
1456 (klass != RBASIC(shared)->klass) ||
1457 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1458 str = str_new_shared(klass, shared);
1459 assert(!STR_EMBED_P(str));
1460 RSTRING(str)->as.heap.ptr += ofs;
1461 RSTRING(str)->as.heap.len -= ofs + rest;
1462 }
1463 else {
1464 if (RBASIC_CLASS(shared) == 0)
1465 FL_SET_RAW(shared, STR_BORROWED);
1466 return shared;
1467 }
1468 }
1469 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1470 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1471 STR_SET_EMBED(str);
1472 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1473 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1474 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1475 }
1476 else {
1477 str = heap_str_make_shared(klass, orig);
1478 }
1479 }
1480
1481 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1482 OBJ_FREEZE(str);
1483 return str;
1484}
1485
1486VALUE
1487rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1488{
1489 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1490}
1491
1492static VALUE
1493str_new_empty_String(VALUE str)
1494{
1495 VALUE v = rb_str_new(0, 0);
1496 rb_enc_copy(v, str);
1497 return v;
1498}
1499
1500#define STR_BUF_MIN_SIZE 63
1501#if !USE_RVARGC
1502STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1503#endif
1504
1505VALUE
1507{
1508 if (STR_EMBEDDABLE_P(capa, 1)) {
1509 return str_alloc_embed(rb_cString, capa + 1);
1510 }
1511
1512 VALUE str = str_alloc_heap(rb_cString);
1513
1514#if !USE_RVARGC
1515 if (capa < STR_BUF_MIN_SIZE) {
1516 capa = STR_BUF_MIN_SIZE;
1517 }
1518#endif
1519 FL_SET(str, STR_NOEMBED);
1520 RSTRING(str)->as.heap.aux.capa = capa;
1521 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1522 RSTRING(str)->as.heap.ptr[0] = '\0';
1523
1524 return str;
1525}
1526
1527VALUE
1529{
1530 VALUE str;
1531 long len = strlen(ptr);
1532
1533 str = rb_str_buf_new(len);
1534 rb_str_buf_cat(str, ptr, len);
1535
1536 return str;
1537}
1538
1539VALUE
1541{
1542 return str_new(0, 0, len);
1543}
1544
1545void
1547{
1548 if (FL_TEST(str, RSTRING_FSTR)) {
1549 st_data_t fstr = (st_data_t)str;
1550
1551 RB_VM_LOCK_ENTER();
1552 {
1553 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1554 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1555 }
1556 RB_VM_LOCK_LEAVE();
1557 }
1558
1559 if (STR_EMBED_P(str)) {
1560 RB_DEBUG_COUNTER_INC(obj_str_embed);
1561 }
1562 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1563 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1564 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1565 }
1566 else {
1567 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1568 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1569 }
1570}
1571
1572RUBY_FUNC_EXPORTED size_t
1573rb_str_memsize(VALUE str)
1574{
1575 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1576 return STR_HEAP_SIZE(str);
1577 }
1578 else {
1579 return 0;
1580 }
1581}
1582
1583VALUE
1585{
1586 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1587}
1588
1589static inline void str_discard(VALUE str);
1590static void str_shared_replace(VALUE str, VALUE str2);
1591
1592void
1594{
1595 if (str != str2) str_shared_replace(str, str2);
1596}
1597
1598static void
1599str_shared_replace(VALUE str, VALUE str2)
1600{
1601 rb_encoding *enc;
1602 int cr;
1603 int termlen;
1604
1605 RUBY_ASSERT(str2 != str);
1606 enc = STR_ENC_GET(str2);
1607 cr = ENC_CODERANGE(str2);
1608 str_discard(str);
1609 termlen = rb_enc_mbminlen(enc);
1610
1611 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1612 STR_SET_EMBED(str);
1613 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1614 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1615 rb_enc_associate(str, enc);
1616 ENC_CODERANGE_SET(str, cr);
1617 }
1618 else {
1619#if USE_RVARGC
1620 if (STR_EMBED_P(str2)) {
1621 assert(!FL_TEST(str2, STR_SHARED));
1622 long len = RSTRING(str2)->as.embed.len;
1623 assert(len + termlen <= str_embed_capa(str2));
1624
1625 char *new_ptr = ALLOC_N(char, len + termlen);
1626 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1627 RSTRING(str2)->as.heap.ptr = new_ptr;
1628 RSTRING(str2)->as.heap.len = len;
1629 RSTRING(str2)->as.heap.aux.capa = len;
1630 STR_SET_NOEMBED(str2);
1631 }
1632#endif
1633
1634 STR_SET_NOEMBED(str);
1635 FL_UNSET(str, STR_SHARED);
1636 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1637 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1638
1639 if (FL_TEST(str2, STR_SHARED)) {
1640 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1641 STR_SET_SHARED(str, shared);
1642 }
1643 else {
1644 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1645 }
1646
1647 /* abandon str2 */
1648 STR_SET_EMBED(str2);
1649 RSTRING_PTR(str2)[0] = 0;
1650 STR_SET_EMBED_LEN(str2, 0);
1651 rb_enc_associate(str, enc);
1652 ENC_CODERANGE_SET(str, cr);
1653 }
1654}
1655
1656VALUE
1658{
1659 VALUE str;
1660
1661 if (RB_TYPE_P(obj, T_STRING)) {
1662 return obj;
1663 }
1664 str = rb_funcall(obj, idTo_s, 0);
1665 return rb_obj_as_string_result(str, obj);
1666}
1667
1668MJIT_FUNC_EXPORTED VALUE
1669rb_obj_as_string_result(VALUE str, VALUE obj)
1670{
1671 if (!RB_TYPE_P(str, T_STRING))
1672 return rb_any_to_s(obj);
1673 return str;
1674}
1675
1676static VALUE
1677str_replace(VALUE str, VALUE str2)
1678{
1679 long len;
1680
1681 len = RSTRING_LEN(str2);
1682 if (STR_SHARED_P(str2)) {
1683 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1684 assert(OBJ_FROZEN(shared));
1685 STR_SET_NOEMBED(str);
1686 RSTRING(str)->as.heap.len = len;
1687 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1688 STR_SET_SHARED(str, shared);
1689 rb_enc_cr_str_exact_copy(str, str2);
1690 }
1691 else {
1692 str_replace_shared(str, str2);
1693 }
1694
1695 return str;
1696}
1697
1698static inline VALUE
1699ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1700{
1701 assert(size > 0);
1702 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1704 return (VALUE)str;
1705}
1706
1707static inline VALUE
1708ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1709{
1710 size_t size = str_embed_size(capa);
1711 assert(rb_gc_size_allocatable_p(size));
1712#if !USE_RVARGC
1713 assert(size <= sizeof(struct RString));
1714#endif
1715 return ec_str_alloc(ec, klass, size);
1716}
1717
1718static inline VALUE
1719ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1720{
1721 return ec_str_alloc(ec, klass, sizeof(struct RString));
1722}
1723
1724static inline VALUE
1725str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1726{
1727 const VALUE flag_mask =
1728#if !USE_RVARGC
1729 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1730#endif
1732 FL_FREEZE
1733 ;
1734 VALUE flags = FL_TEST_RAW(str, flag_mask);
1735 int encidx = 0;
1736 if (STR_EMBED_P(str)) {
1737 long len = RSTRING_EMBED_LEN(str);
1738
1739 assert(str_embed_capa(dup) >= len + 1);
1740 STR_SET_EMBED_LEN(dup, len);
1741 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1742 }
1743 else {
1744 VALUE root = str;
1745 if (FL_TEST_RAW(str, STR_SHARED)) {
1746 root = RSTRING(str)->as.heap.aux.shared;
1747 }
1748 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1749 root = str = str_new_frozen(klass, str);
1750 flags = FL_TEST_RAW(str, flag_mask);
1751 }
1752 assert(!STR_SHARED_P(root));
1753 assert(RB_OBJ_FROZEN_RAW(root));
1754#if USE_RVARGC
1755 if (1) {
1756#else
1757 if (STR_EMBED_P(root)) {
1758 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1759 char, RSTRING_EMBED_LEN_MAX + 1);
1760 }
1761 else {
1762#endif
1763 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1764 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1765 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1766 flags |= RSTRING_NOEMBED | STR_SHARED;
1767 }
1768 }
1769
1770 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1771 encidx = rb_enc_get_index(str);
1772 flags &= ~ENCODING_MASK;
1773 }
1774 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1775 if (encidx) rb_enc_associate_index(dup, encidx);
1776 return dup;
1777}
1778
1779static inline VALUE
1780ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1781{
1782 VALUE dup;
1783 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784 dup = ec_str_alloc_heap(ec, klass);
1785 }
1786 else {
1787 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1788 }
1789
1790 return str_duplicate_setup(klass, str, dup);
1791}
1792
1793static inline VALUE
1794str_duplicate(VALUE klass, VALUE str)
1795{
1796 VALUE dup;
1797 if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1798 dup = str_alloc_heap(klass);
1799 }
1800 else {
1801 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1802 }
1803
1804 return str_duplicate_setup(klass, str, dup);
1805}
1806
1807VALUE
1809{
1810 return str_duplicate(rb_obj_class(str), str);
1811}
1812
1813VALUE
1815{
1816 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1817 return str_duplicate(rb_cString, str);
1818}
1819
1820VALUE
1821rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1822{
1823 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1824 return ec_str_duplicate(ec, rb_cString, str);
1825}
1826
1827/*
1828 * call-seq:
1829 * String.new(string = '') -> new_string
1830 * String.new(string = '', encoding: encoding) -> new_string
1831 * String.new(string = '', capacity: size) -> new_string
1832 *
1833 * Returns a new \String that is a copy of +string+.
1834 *
1835 * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1836 * s = String.new
1837 * s # => ""
1838 * s.encoding # => #<Encoding:ASCII-8BIT>
1839 *
1840 * With the single \String argument +string+, returns a copy of +string+
1841 * with the same encoding as +string+:
1842 * s = String.new("Que veut dire \u{e7}a?")
1843 * s # => "Que veut dire \u{e7}a?"
1844 * s.encoding # => #<Encoding:UTF-8>
1845 *
1846 * Literal strings like <tt>""</tt> or here-documents always use
1847 * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1848 *
 * With keyword +encoding+, returns a copy of +string+
1850 * with the specified encoding:
1851 * s = String.new(encoding: 'ASCII')
1852 * s.encoding # => #<Encoding:US-ASCII>
1853 * s = String.new('foo', encoding: 'ASCII')
1854 * s.encoding # => #<Encoding:US-ASCII>
1855 *
1856 * Note that these are equivalent:
1857 * s0 = String.new('foo', encoding: 'ASCII')
1858 * s1 = 'foo'.force_encoding('ASCII')
1859 * s0.encoding == s1.encoding # => true
1860 *
 * With keyword +capacity+, returns a copy of +string+;
1862 * the given +capacity+ may set the size of the internal buffer,
1863 * which may affect performance:
1864 * String.new(capacity: 1) # => ""
1865 * String.new(capacity: 4096) # => ""
1866 *
1867 * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1868 *
1869 * String.new('hello', encoding: 'UTF-8', capacity: 25)
1870 *
1871 */
1872
1873static VALUE
1874rb_str_init(int argc, VALUE *argv, VALUE str)
1875{
1876 static ID keyword_ids[2];
1877 VALUE orig, opt, venc, vcapa;
1878 VALUE kwargs[2];
1879 rb_encoding *enc = 0;
1880 int n;
1881
1882 if (!keyword_ids[0]) {
1883 keyword_ids[0] = rb_id_encoding();
1884 CONST_ID(keyword_ids[1], "capacity");
1885 }
1886
1887 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1888 if (!NIL_P(opt)) {
1889 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1890 venc = kwargs[0];
1891 vcapa = kwargs[1];
1892 if (venc != Qundef && !NIL_P(venc)) {
1893 enc = rb_to_encoding(venc);
1894 }
1895 if (vcapa != Qundef && !NIL_P(vcapa)) {
1896 long capa = NUM2LONG(vcapa);
1897 long len = 0;
1898 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1899
1900 if (capa < STR_BUF_MIN_SIZE) {
1901 capa = STR_BUF_MIN_SIZE;
1902 }
1903 if (n == 1) {
1904 StringValue(orig);
1905 len = RSTRING_LEN(orig);
1906 if (capa < len) {
1907 capa = len;
1908 }
1909 if (orig == str) n = 0;
1910 }
1911 str_modifiable(str);
1912 if (STR_EMBED_P(str)) { /* make noembed always */
1913 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1914#if USE_RVARGC
1915 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1916 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1917#else
1918 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1919#endif
1920 RSTRING(str)->as.heap.ptr = new_ptr;
1921 }
1922 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1923 const size_t size = (size_t)capa + termlen;
1924 const char *const old_ptr = RSTRING_PTR(str);
1925 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1926 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1927 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1928 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1929 RSTRING(str)->as.heap.ptr = new_ptr;
1930 }
1931 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1932 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1933 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1934 }
1935 RSTRING(str)->as.heap.len = len;
1936 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1937 if (n == 1) {
1938 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1939 rb_enc_cr_str_exact_copy(str, orig);
1940 }
1941 FL_SET(str, STR_NOEMBED);
1942 RSTRING(str)->as.heap.aux.capa = capa;
1943 }
1944 else if (n == 1) {
1945 rb_str_replace(str, orig);
1946 }
1947 if (enc) {
1948 rb_enc_associate(str, enc);
1950 }
1951 }
1952 else if (n == 1) {
1953 rb_str_replace(str, orig);
1954 }
1955 return str;
1956}
1957
1958#ifdef NONASCII_MASK
1959#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1960
1961/*
1962 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1963 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1964 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1965 *
1966 * if (!(byte & 0x80))
1967 * byte |= 0x40; // turn on bit6
1968 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1969 *
1970 * This function calculates whether a byte is leading or not for all bytes
1971 * in the argument word by concurrently using the above logic, and then
1972 * adds up the number of leading bytes in the word.
1973 */
1974static inline uintptr_t
1975count_utf8_lead_bytes_with_word(const uintptr_t *s)
1976{
1977 uintptr_t d = *s;
1978
1979 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1980 d = (d>>6) | (~d>>7);
1981 d &= NONASCII_MASK >> 7;
1982
1983 /* Gather all bytes. */
1984#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1985 /* use only if it can use POPCNT */
1986 return rb_popcount_intptr(d);
1987#else
1988 d += (d>>8);
1989 d += (d>>16);
1990# if SIZEOF_VOIDP == 8
1991 d += (d>>32);
1992# endif
1993 return (d&0xF);
1994#endif
1995}
1996#endif
1997
1998static inline long
1999enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2000{
2001 long c;
2002 const char *q;
2003
2004 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2005 long diff = (long)(e - p);
2006 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2007 }
2008#ifdef NONASCII_MASK
2009 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2010 uintptr_t len = 0;
2011 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2012 const uintptr_t *s, *t;
2013 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2014 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2015 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2016 while (p < (const char *)s) {
2017 if (is_utf8_lead_byte(*p)) len++;
2018 p++;
2019 }
2020 while (s < t) {
2021 len += count_utf8_lead_bytes_with_word(s);
2022 s++;
2023 }
2024 p = (const char *)s;
2025 }
2026 while (p < e) {
2027 if (is_utf8_lead_byte(*p)) len++;
2028 p++;
2029 }
2030 return (long)len;
2031 }
2032#endif
2033 else if (rb_enc_asciicompat(enc)) {
2034 c = 0;
2035 if (ENC_CODERANGE_CLEAN_P(cr)) {
2036 while (p < e) {
2037 if (ISASCII(*p)) {
2038 q = search_nonascii(p, e);
2039 if (!q)
2040 return c + (e - p);
2041 c += q - p;
2042 p = q;
2043 }
2044 p += rb_enc_fast_mbclen(p, e, enc);
2045 c++;
2046 }
2047 }
2048 else {
2049 while (p < e) {
2050 if (ISASCII(*p)) {
2051 q = search_nonascii(p, e);
2052 if (!q)
2053 return c + (e - p);
2054 c += q - p;
2055 p = q;
2056 }
2057 p += rb_enc_mbclen(p, e, enc);
2058 c++;
2059 }
2060 }
2061 return c;
2062 }
2063
2064 for (c=0; p<e; c++) {
2065 p += rb_enc_mbclen(p, e, enc);
2066 }
2067 return c;
2068}
2069
2070long
2071rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2072{
2073 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2074}
2075
2076/* To get strlen with cr
2077 * Note that given cr is not used.
2078 */
2079long
2080rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2081{
2082 long c;
2083 const char *q;
2084 int ret;
2085
2086 *cr = 0;
2087 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2088 long diff = (long)(e - p);
2089 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2090 }
2091 else if (rb_enc_asciicompat(enc)) {
2092 c = 0;
2093 while (p < e) {
2094 if (ISASCII(*p)) {
2095 q = search_nonascii(p, e);
2096 if (!q) {
2097 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2098 return c + (e - p);
2099 }
2100 c += q - p;
2101 p = q;
2102 }
2103 ret = rb_enc_precise_mbclen(p, e, enc);
2104 if (MBCLEN_CHARFOUND_P(ret)) {
2105 *cr |= ENC_CODERANGE_VALID;
2106 p += MBCLEN_CHARFOUND_LEN(ret);
2107 }
2108 else {
2110 p++;
2111 }
2112 c++;
2113 }
2114 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2115 return c;
2116 }
2117
2118 for (c=0; p<e; c++) {
2119 ret = rb_enc_precise_mbclen(p, e, enc);
2120 if (MBCLEN_CHARFOUND_P(ret)) {
2121 *cr |= ENC_CODERANGE_VALID;
2122 p += MBCLEN_CHARFOUND_LEN(ret);
2123 }
2124 else {
2126 if (p + rb_enc_mbminlen(enc) <= e)
2127 p += rb_enc_mbminlen(enc);
2128 else
2129 p = e;
2130 }
2131 }
2132 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2133 return c;
2134}
2135
2136/* enc must be str's enc or rb_enc_check(str, str2) */
2137static long
2138str_strlen(VALUE str, rb_encoding *enc)
2139{
2140 const char *p, *e;
2141 int cr;
2142
2143 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2144 if (!enc) enc = STR_ENC_GET(str);
2145 p = RSTRING_PTR(str);
2146 e = RSTRING_END(str);
2147 cr = ENC_CODERANGE(str);
2148
2149 if (cr == ENC_CODERANGE_UNKNOWN) {
2150 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2151 if (cr) ENC_CODERANGE_SET(str, cr);
2152 return n;
2153 }
2154 else {
2155 return enc_strlen(p, e, enc, cr);
2156 }
2157}
2158
2159long
2161{
2162 return str_strlen(str, NULL);
2163}
2164
2165/*
2166 * call-seq:
2167 * length -> integer
2168 *
2169 * Returns the count of characters (not bytes) in +self+:
2170 *
2171 * "\x80\u3042".length # => 2
2172 * "hello".length # => 5
2173 *
2174 * String#size is an alias for String#length.
2175 *
2176 * Related: String#bytesize.
2177 */
2178
2179VALUE
2181{
2182 return LONG2NUM(str_strlen(str, NULL));
2183}
2184
2185/*
2186 * call-seq:
2187 * bytesize -> integer
2188 *
2189 * Returns the count of bytes in +self+:
2190 *
2191 * "\x80\u3042".bytesize # => 4
2192 * "hello".bytesize # => 5
2193 *
2194 * Related: String#length.
2195 */
2196
2197static VALUE
2198rb_str_bytesize(VALUE str)
2199{
2200 return LONG2NUM(RSTRING_LEN(str));
2201}
2202
2203/*
2204 * call-seq:
2205 * empty? -> true or false
2206 *
2207 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2208 *
2209 * "hello".empty? # => false
2210 * " ".empty? # => false
2211 * "".empty? # => true
2212 *
2213 */
2214
2215static VALUE
2216rb_str_empty(VALUE str)
2217{
2218 return RBOOL(RSTRING_LEN(str) == 0);
2219}
2220
2221/*
2222 * call-seq:
2223 * string + other_string -> new_string
2224 *
2225 * Returns a new \String containing +other_string+ concatenated to +self+:
2226 *
2227 * "Hello from " + self.to_s # => "Hello from main"
2228 *
2229 */
2230
2231VALUE
2233{
2234 VALUE str3;
2235 rb_encoding *enc;
2236 char *ptr1, *ptr2, *ptr3;
2237 long len1, len2;
2238 int termlen;
2239
2240 StringValue(str2);
2241 enc = rb_enc_check_str(str1, str2);
2242 RSTRING_GETMEM(str1, ptr1, len1);
2243 RSTRING_GETMEM(str2, ptr2, len2);
2244 termlen = rb_enc_mbminlen(enc);
2245 if (len1 > LONG_MAX - len2) {
2246 rb_raise(rb_eArgError, "string size too big");
2247 }
2248 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2249 ptr3 = RSTRING_PTR(str3);
2250 memcpy(ptr3, ptr1, len1);
2251 memcpy(ptr3+len1, ptr2, len2);
2252 TERM_FILL(&ptr3[len1+len2], termlen);
2253
2254 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2256 RB_GC_GUARD(str1);
2257 RB_GC_GUARD(str2);
2258 return str3;
2259}
2260
2261/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2262MJIT_FUNC_EXPORTED VALUE
2263rb_str_opt_plus(VALUE str1, VALUE str2)
2264{
2265 assert(RBASIC_CLASS(str1) == rb_cString);
2266 assert(RBASIC_CLASS(str2) == rb_cString);
2267 long len1, len2;
2268 MAYBE_UNUSED(char) *ptr1, *ptr2;
2269 RSTRING_GETMEM(str1, ptr1, len1);
2270 RSTRING_GETMEM(str2, ptr2, len2);
2271 int enc1 = rb_enc_get_index(str1);
2272 int enc2 = rb_enc_get_index(str2);
2273
2274 if (enc1 < 0) {
2275 return Qundef;
2276 }
2277 else if (enc2 < 0) {
2278 return Qundef;
2279 }
2280 else if (enc1 != enc2) {
2281 return Qundef;
2282 }
2283 else if (len1 > LONG_MAX - len2) {
2284 return Qundef;
2285 }
2286 else {
2287 return rb_str_plus(str1, str2);
2288 }
2289
2290}
2291
2292/*
2293 * call-seq:
2294 * string * integer -> new_string
2295 *
2296 * Returns a new \String containing +integer+ copies of +self+:
2297 *
2298 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2299 * "Ho! " * 0 # => ""
2300 *
2301 */
2302
2303VALUE
2305{
2306 VALUE str2;
2307 long n, len;
2308 char *ptr2;
2309 int termlen;
2310
2311 if (times == INT2FIX(1)) {
2312 return str_duplicate(rb_cString, str);
2313 }
2314 if (times == INT2FIX(0)) {
2315 str2 = str_alloc_embed(rb_cString, 0);
2316 rb_enc_copy(str2, str);
2317 return str2;
2318 }
2319 len = NUM2LONG(times);
2320 if (len < 0) {
2321 rb_raise(rb_eArgError, "negative argument");
2322 }
2323 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2324 if (STR_EMBEDDABLE_P(len, 1)) {
2325 str2 = str_alloc_embed(rb_cString, len + 1);
2326 memset(RSTRING_PTR(str2), 0, len + 1);
2327 }
2328 else {
2329 str2 = str_alloc_heap(rb_cString);
2330 RSTRING(str2)->as.heap.aux.capa = len;
2331 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2332 STR_SET_NOEMBED(str2);
2333 }
2334 STR_SET_LEN(str2, len);
2335 rb_enc_copy(str2, str);
2336 return str2;
2337 }
2338 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2339 rb_raise(rb_eArgError, "argument too big");
2340 }
2341
2342 len *= RSTRING_LEN(str);
2343 termlen = TERM_LEN(str);
2344 str2 = str_new0(rb_cString, 0, len, termlen);
2345 ptr2 = RSTRING_PTR(str2);
2346 if (len) {
2347 n = RSTRING_LEN(str);
2348 memcpy(ptr2, RSTRING_PTR(str), n);
2349 while (n <= len/2) {
2350 memcpy(ptr2 + n, ptr2, n);
2351 n *= 2;
2352 }
2353 memcpy(ptr2 + n, ptr2, len-n);
2354 }
2355 STR_SET_LEN(str2, len);
2356 TERM_FILL(&ptr2[len], termlen);
2357 rb_enc_cr_str_copy_for_substr(str2, str);
2358
2359 return str2;
2360}
2361
2362/*
2363 * call-seq:
2364 * string % object -> new_string
2365 *
2366 * Returns the result of formatting +object+ into the format specification +self+
2367 * (see Kernel#sprintf for formatting details):
2368 *
2369 * "%05d" % 123 # => "00123"
2370 *
2371 * If +self+ contains multiple substitutions, +object+ must be
2372 * an \Array or \Hash containing the values to be substituted:
2373 *
2374 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2375 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2376 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2377 *
2378 */
2379
2380static VALUE
2381rb_str_format_m(VALUE str, VALUE arg)
2382{
2383 VALUE tmp = rb_check_array_type(arg);
2384
2385 if (!NIL_P(tmp)) {
2386 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2387 }
2388 return rb_str_format(1, &arg, str);
2389}
2390
2391static inline void
2392rb_check_lockedtmp(VALUE str)
2393{
2394 if (FL_TEST(str, STR_TMPLOCK)) {
2395 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2396 }
2397}
2398
2399static inline void
2400str_modifiable(VALUE str)
2401{
2402 rb_check_lockedtmp(str);
2403 rb_check_frozen(str);
2404}
2405
2406static inline int
2407str_dependent_p(VALUE str)
2408{
2409 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2410 return 0;
2411 }
2412 else {
2413 return 1;
2414 }
2415}
2416
2417static inline int
2418str_independent(VALUE str)
2419{
2420 str_modifiable(str);
2421 return !str_dependent_p(str);
2422}
2423
2424static void
2425str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2426{
2427 char *ptr;
2428 char *oldptr;
2429 long capa = len + expand;
2430
2431 if (len > capa) len = capa;
2432
2433 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2434 ptr = RSTRING(str)->as.heap.ptr;
2435 STR_SET_EMBED(str);
2436 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2437 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2438 STR_SET_EMBED_LEN(str, len);
2439 return;
2440 }
2441
2442 ptr = ALLOC_N(char, (size_t)capa + termlen);
2443 oldptr = RSTRING_PTR(str);
2444 if (oldptr) {
2445 memcpy(ptr, oldptr, len);
2446 }
2447 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2448 xfree(oldptr);
2449 }
2450 STR_SET_NOEMBED(str);
2451 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2452 TERM_FILL(ptr + len, termlen);
2453 RSTRING(str)->as.heap.ptr = ptr;
2454 RSTRING(str)->as.heap.len = len;
2455 RSTRING(str)->as.heap.aux.capa = capa;
2456}
2457
2458void
2460{
2461 if (!str_independent(str))
2462 str_make_independent(str);
2464}
2465
2466void
2468{
2469 int termlen = TERM_LEN(str);
2470 long len = RSTRING_LEN(str);
2471
2472 if (expand < 0) {
2473 rb_raise(rb_eArgError, "negative expanding string size");
2474 }
2475 if (expand >= LONG_MAX - len) {
2476 rb_raise(rb_eArgError, "string size too big");
2477 }
2478
2479 if (!str_independent(str)) {
2480 str_make_independent_expand(str, len, expand, termlen);
2481 }
2482 else if (expand > 0) {
2483 RESIZE_CAPA_TERM(str, len + expand, termlen);
2484 }
2486}
2487
2488/* As rb_str_modify(), but don't clear coderange */
2489static void
2490str_modify_keep_cr(VALUE str)
2491{
2492 if (!str_independent(str))
2493 str_make_independent(str);
2495 /* Force re-scan later */
2497}
2498
2499static inline void
2500str_discard(VALUE str)
2501{
2502 str_modifiable(str);
2503 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2504 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2505 RSTRING(str)->as.heap.ptr = 0;
2506 RSTRING(str)->as.heap.len = 0;
2507 }
2508}
2509
2510void
2512{
2513 rb_encoding *enc = rb_enc_get(str);
2514 if (!rb_enc_asciicompat(enc)) {
2515 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2516 }
2517}
2518
2519VALUE
2521{
2522 VALUE s = *ptr;
2523 if (!RB_TYPE_P(s, T_STRING)) {
2524 s = rb_str_to_str(s);
2525 *ptr = s;
2526 }
2527 return s;
2528}
2529
2530char *
2532{
2533 VALUE str = rb_string_value(ptr);
2534 return RSTRING_PTR(str);
2535}
2536
/*
 * Return 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2545
2546static const char *
2547str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2548{
2549 const char *e = s + len;
2550
2551 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2552 if (zero_filled(s, minlen)) return s;
2553 }
2554 return 0;
2555}
2556
2557static char *
2558str_fill_term(VALUE str, char *s, long len, int termlen)
2559{
2560 /* This function assumes that (capa + termlen) bytes of memory
2561 * is allocated, like many other functions in this file.
2562 */
2563 if (str_dependent_p(str)) {
2564 if (!zero_filled(s + len, termlen))
2565 str_make_independent_expand(str, len, 0L, termlen);
2566 }
2567 else {
2568 TERM_FILL(s + len, termlen);
2569 return s;
2570 }
2571 return RSTRING_PTR(str);
2572}
2573
/*
 * Adjust str, which was kept with an oldtermlen-byte terminator, so that a
 * termlen-byte terminator fits after its contents.
 */
2574void
2575rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2576{
/* capacity reported by str_capacity() plus the old terminator bytes */
2577 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2578 long len = RSTRING_LEN(str);
2579
2580 assert(capa >= len);
/* not enough room after the contents: move to a new buffer */
2581 if (capa - len < termlen) {
2582 rb_check_lockedtmp(str);
2583 str_make_independent_expand(str, len, 0L, termlen);
2584 }
/* dependent buffer: only a longer terminator forces a private copy */
2585 else if (str_dependent_p(str)) {
2586 if (termlen > oldtermlen)
2587 str_make_independent_expand(str, len, 0L, termlen);
2588 }
2589 else {
2590 if (!STR_EMBED_P(str)) {
2591 /* modify capa instead of realloc */
2592 assert(!FL_TEST((str), STR_SHARED));
2593 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2594 }
/* the terminator grew: write the wider terminator in place */
2595 if (termlen > oldtermlen) {
2596 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2597 }
2598 }
2599
2600 return;
2601}
2602
2603static char *
2604str_null_check(VALUE str, int *w)
2605{
2606 char *s = RSTRING_PTR(str);
2607 long len = RSTRING_LEN(str);
2608 rb_encoding *enc = rb_enc_get(str);
2609 const int minlen = rb_enc_mbminlen(enc);
2610
2611 if (minlen > 1) {
2612 *w = 1;
2613 if (str_null_char(s, len, minlen, enc)) {
2614 return NULL;
2615 }
2616 return str_fill_term(str, s, len, minlen);
2617 }
2618 *w = 0;
2619 if (!s || memchr(s, 0, len)) {
2620 return NULL;
2621 }
2622 if (s[len]) {
2623 s = str_fill_term(str, s, len, minlen);
2624 }
2625 return s;
2626}
2627
2628char *
2629rb_str_to_cstr(VALUE str)
2630{
2631 int w;
2632 return str_null_check(str, &w);
2633}
2634
2635char *
2637{
2638 VALUE str = rb_string_value(ptr);
2639 int w;
2640 char *s = str_null_check(str, &w);
2641 if (!s) {
2642 if (w) {
2643 rb_raise(rb_eArgError, "string contains null char");
2644 }
2645 rb_raise(rb_eArgError, "string contains null byte");
2646 }
2647 return s;
2648}
2649
2650char *
2651rb_str_fill_terminator(VALUE str, const int newminlen)
2652{
2653 char *s = RSTRING_PTR(str);
2654 long len = RSTRING_LEN(str);
2655 return str_fill_term(str, s, len, newminlen);
2656}
2657
2658VALUE
2660{
2661 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2662 return str;
2663}
2664
2665/*
2666 * call-seq:
2667 * String.try_convert(object) -> object, new_string, or nil
2668 *
2669 * If +object+ is a \String object, returns +object+.
2670 *
2671 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2672 * calls <tt>object.to_str</tt> and returns the result.
2673 *
2674 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2675 *
2676 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2677 */
2678static VALUE
2679rb_str_s_try_convert(VALUE dummy, VALUE str)
2680{
2681 return rb_check_string_type(str);
2682}
2683
/*
 * Advance p by *nthp characters of encoding enc without going past e and
 * return the resulting position.
 *
 * *nthp is written back on return.  The two fixed-width fast paths leave it
 * unchanged; the scanning paths decrement it per character consumed.
 */
2684static char*
2685str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2686{
2687 long nth = *nthp;
/* single-byte encoding: one character is one byte */
2688 if (rb_enc_mbmaxlen(enc) == 1) {
2689 p += nth;
2690 }
/* fixed-width encoding: every character is mbmaxlen bytes */
2691 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2692 p += nth * rb_enc_mbmaxlen(enc);
2693 }
/* ASCII-compatible variable width: skip runs of ASCII bytes in bulk */
2694 else if (rb_enc_asciicompat(enc)) {
2695 const char *p2, *e2;
2696 int n;
2697
2698 while (p < e && 0 < nth) {
/* every character is at least one byte, so the target cannot lie beyond
 * p + nth; if even that is past e, the request runs off the end */
2699 e2 = p + nth;
2700 if (e < e2) {
2701 *nthp = nth;
2702 return (char *)e;
2703 }
2704 if (ISASCII(*p)) {
2705 p2 = search_nonascii(p, e2);
/* all ASCII up to e2: each byte counted as one character */
2706 if (!p2) {
2707 nth -= e2 - p;
2708 *nthp = nth;
2709 return (char *)e2;
2710 }
2711 nth -= p2 - p;
2712 p = p2;
2713 }
/* step over one character as measured by rb_enc_mbclen() */
2714 n = rb_enc_mbclen(p, e, enc);
2715 p += n;
2716 nth--;
2717 }
2718 *nthp = nth;
/* characters still owed: the string ended first */
2719 if (nth != 0) {
2720 return (char *)e;
2721 }
2722 return (char *)p;
2723 }
/* other encodings: step one character at a time */
2724 else {
2725 while (p < e && nth--) {
2726 p += rb_enc_mbclen(p, e, enc);
2727 }
2728 }
/* clamp to the end of the buffer */
2729 if (p > e) p = e;
2730 *nthp = nth;
2731 return (char*)p;
2732}
2733
2734char*
2735rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2736{
2737 return str_nth_len(p, e, &nth, enc);
2738}
2739
2740static char*
2741str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2742{
2743 if (singlebyte)
2744 p += nth;
2745 else {
2746 p = str_nth_len(p, e, &nth, enc);
2747 }
2748 if (!p) return 0;
2749 if (p > e) p = e;
2750 return (char *)p;
2751}
2752
2753/* char offset to byte offset */
2754static long
2755str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2756{
2757 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2758 if (!pp) return e - p;
2759 return pp - p;
2760}
2761
2762long
2763rb_str_offset(VALUE str, long pos)
2764{
2765 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2766 STR_ENC_GET(str), single_byte_optimizable(str));
2767}
2768
2769#ifdef NONASCII_MASK
/*
 * UTF-8 specific walk: advance p past *nthp lead bytes (one lead byte per
 * character) without going past e.  On return *nthp holds the count that
 * was not consumed.
 */
2770static char *
2771str_utf8_nth(const char *p, const char *e, long *nthp)
2772{
2773 long nth = *nthp;
/* only worth the word-at-a-time pass for long inputs and large counts */
2774 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2775 const uintptr_t *s, *t;
2776 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one */
2777 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2778 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
/* count lead bytes one at a time up to the first aligned word */
2779 while (p < (const char *)s) {
2780 if (is_utf8_lead_byte(*p)) nth--;
2781 p++;
2782 }
/* then a whole word at a time while at least a word's worth of
 * characters is still owed */
2783 do {
2784 nth -= count_utf8_lead_bytes_with_word(s);
2785 s++;
2786 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2787 p = (char *)s;
2788 }
/* finish byte by byte; stop on the lead byte of the target character */
2789 while (p < e) {
2790 if (is_utf8_lead_byte(*p)) {
2791 if (nth == 0) break;
2792 nth--;
2793 }
2794 p++;
2795 }
2796 *nthp = nth;
2797 return (char *)p;
2798}
2799
/*
 * Byte distance from p to the position str_utf8_nth() reaches for nth
 * characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *reached = str_utf8_nth(p, e, &remaining);

    return reached - p;
}
2806#endif
2807
2808/* byte offset to char offset */
2809long
2810rb_str_sublen(VALUE str, long pos)
2811{
2812 if (single_byte_optimizable(str) || pos < 0)
2813 return pos;
2814 else {
2815 char *p = RSTRING_PTR(str);
2816 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2817 }
2818}
2819
/*
 * Return a new string holding len bytes of str starting at byte offset
 * beg.  The bytes are either shared with a frozen copy of str or copied,
 * depending on the two macro checks below.
 */
2820VALUE
2821rb_str_subseq(VALUE str, long beg, long len)
2822{
2823 VALUE str2;
2824
/* too long to embed and sharable per SHARABLE_SUBSTRING_P: share the buffer */
2825 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2826 SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2827 long olen;
2828 str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
/* point the new string at the requested offset inside the shared buffer */
2829 RSTRING(str2)->as.heap.ptr += beg;
2830 olen = RSTRING(str2)->as.heap.len;
2831 if (olen > len) RSTRING(str2)->as.heap.len = len;
2832 }
/* otherwise copy the bytes into a fresh string */
2833 else {
2834 str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2835 RB_GC_GUARD(str);
2836 }
2837
2838 rb_enc_cr_str_copy_for_substr(str2, str);
2839
2840 return str2;
2841}
2842
/*
 * Locate the substring of str that starts at character index beg and is
 * *lenp characters long.
 *
 * Returns a pointer to its first byte and stores its length in bytes in
 * *lenp, or returns 0 when the request is out of range.  A negative beg
 * counts from the end of the string.
 */
2843char *
2844rb_str_subpos(VALUE str, long beg, long *lenp)
2845{
2846 long len = *lenp;
2847 long slen = -1L;
2848 long blen = RSTRING_LEN(str);
2849 rb_encoding *enc = STR_ENC_GET(str);
2850 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2851
2852 if (len < 0) return 0;
2853 if (!blen) {
2854 len = 0;
2855 }
/* single-byte contents: character indexes are byte offsets */
2856 if (single_byte_optimizable(str)) {
2857 if (beg > blen) return 0;
2858 if (beg < 0) {
2859 beg += blen;
2860 if (beg < 0) return 0;
2861 }
2862 if (len > blen - beg)
2863 len = blen - beg;
2864 if (len < 0) return 0;
2865 p = s + beg;
2866 goto end;
2867 }
2868 if (beg < 0) {
/* at most -beg characters exist between the start position and the end */
2869 if (len > -beg) len = -beg;
/* start is close to the end: walk backwards from e instead of counting
 * the whole string */
2870 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2871 beg = -beg;
/* move e back to the end of the requested range ... */
2872 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2873 p = e;
2874 if (!p) return 0;
/* ... then move p back len more characters to its start */
2875 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2876 if (!p) return 0;
2877 len = e - p;
2878 goto end;
2879 }
2880 else {
/* otherwise turn beg into a non-negative character index */
2881 slen = str_strlen(str, enc);
2882 beg += slen;
2883 if (beg < 0) return 0;
2884 p = s + beg;
2885 if (len == 0) goto end;
2886 }
2887 }
/* a character index greater than the byte length is certainly out of range */
2888 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2889 return 0;
2890 }
2891 if (len == 0) {
2892 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2893 p = s + beg;
2894 }
2895#ifdef NONASCII_MASK
/* valid UTF-8: use the lead-byte counting helpers */
2896 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2897 enc == rb_utf8_encoding()) {
2898 p = str_utf8_nth(s, e, &beg);
2899 if (beg > 0) return 0;
2900 len = str_utf8_offset(p, e, len);
2901 }
2902#endif
/* fixed-width encoding: scale indexes by the character size */
2903 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2904 int char_sz = rb_enc_mbmaxlen(enc);
2905
2906 p = s + beg * char_sz;
2907 if (p > e) {
2908 return 0;
2909 }
2910 else if (len * char_sz > e - p)
2911 len = e - p;
2912 else
2913 len *= char_sz;
2914 }
/* general case: the walk reached the end of the string */
2915 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2916 if (beg > 0) return 0;
2917 len = 0;
2918 }
2919 else {
2920 len = str_offset(p, e, len, enc, 0);
2921 }
2922 end:
2923 *lenp = len;
2924 RB_GC_GUARD(str);
2925 return p;
2926}
2927
2928static VALUE str_substr(VALUE str, long beg, long len, int empty);
2929
2930VALUE
2931rb_str_substr(VALUE str, long beg, long len)
2932{
2933 return str_substr(str, beg, len, TRUE);
2934}
2935
/*
 * Build the substring of str located by rb_str_subpos(str, beg, &len).
 *
 * Returns Qnil when rb_str_subpos() finds no such range.  On the copying
 * path a zero-length result is also returned as Qnil when empty is false.
 */
2936static VALUE
2937str_substr(VALUE str, long beg, long len, int empty)
2938{
2939 VALUE str2;
2940 char *p = rb_str_subpos(str, beg, &len);
2941
2942 if (!p) return Qnil;
/* too long to embed and sharable per SHARABLE_SUBSTRING_P: share the buffer */
2943 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2944 SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
/* take the offset before str is replaced by its frozen counterpart */
2945 long ofs = p - RSTRING_PTR(str);
2946 str2 = rb_str_new_frozen(str);
2947 str2 = str_new_shared(rb_cString, str2);
2948 RSTRING(str2)->as.heap.ptr += ofs;
2949 RSTRING(str2)->as.heap.len = len;
2950 ENC_CODERANGE_CLEAR(str2);
2951 }
2952 else {
2953 if (!len && !empty) return Qnil;
2954 str2 = rb_str_new(p, len);
2955 RB_GC_GUARD(str);
2956 }
2957 rb_enc_cr_str_copy_for_substr(str2, str);
2958
2959 return str2;
2960}
2961
2962VALUE
2964{
2965 if (OBJ_FROZEN(str)) return str;
2966 rb_str_resize(str, RSTRING_LEN(str));
2967 return rb_obj_freeze(str);
2968}
2969
2970
2971/*
2972 * call-seq:
2973 * +string -> new_string or self
2974 *
2975 * Returns +self+ if +self+ is not frozen.
2976 *
2977 * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2978 */
2979static VALUE
2980str_uplus(VALUE str)
2981{
2982 if (OBJ_FROZEN(str)) {
2983 return rb_str_dup(str);
2984 }
2985 else {
2986 return str;
2987 }
2988}
2989
2990/*
2991 * call-seq:
2992 * -string -> frozen_string
2993 *
2994 * Returns a frozen, possibly pre-existing copy of the string.
2995 *
2996 * The returned \String will be deduplicated as long as it does not have
2997 * any instance variables set on it.
2998 */
2999static VALUE
3000str_uminus(VALUE str)
3001{
3002 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3003 str = rb_str_dup(str);
3004 }
3005 return rb_fstring(str);
3006}
3007
/* rb_str_dup_frozen is kept as an alias of rb_str_new_frozen; the #define
 * below makes later uses of the old name in this file expand to
 * rb_str_new_frozen. */
3008RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3009#define rb_str_dup_frozen rb_str_new_frozen
3010
3011VALUE
3013{
3014 if (FL_TEST(str, STR_TMPLOCK)) {
3015 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3016 }
3017 FL_SET(str, STR_TMPLOCK);
3018 return str;
3019}
3020
3021VALUE
3023{
3024 if (!FL_TEST(str, STR_TMPLOCK)) {
3025 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3026 }
3027 FL_UNSET(str, STR_TMPLOCK);
3028 return str;
3029}
3030
3031RUBY_FUNC_EXPORTED VALUE
3032rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3033{
3034 rb_str_locktmp(str);
3035 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3036}
3037
3038void
3040{
3041 long capa;
3042 const int termlen = TERM_LEN(str);
3043
3044 str_modifiable(str);
3045 if (STR_SHARED_P(str)) {
3046 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3047 }
3048 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3049 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3050 }
3051 STR_SET_LEN(str, len);
3052 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3053}
3054
3055VALUE
3057{
3058 long slen;
3059 int independent;
3060
3061 if (len < 0) {
3062 rb_raise(rb_eArgError, "negative string size (or size too big)");
3063 }
3064
3065 independent = str_independent(str);
3067 slen = RSTRING_LEN(str);
3068
3069 {
3070 long capa;
3071 const int termlen = TERM_LEN(str);
3072 if (STR_EMBED_P(str)) {
3073 if (len == slen) return str;
3074 if (str_embed_capa(str) >= len + termlen) {
3075 STR_SET_EMBED_LEN(str, len);
3076 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3077 return str;
3078 }
3079 str_make_independent_expand(str, slen, len - slen, termlen);
3080 }
3081 else if (str_embed_capa(str) >= len + termlen) {
3082 char *ptr = STR_HEAP_PTR(str);
3083 STR_SET_EMBED(str);
3084 if (slen > len) slen = len;
3085 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3086 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3087 STR_SET_EMBED_LEN(str, len);
3088 if (independent) ruby_xfree(ptr);
3089 return str;
3090 }
3091 else if (!independent) {
3092 if (len == slen) return str;
3093 str_make_independent_expand(str, slen, len - slen, termlen);
3094 }
3095 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3096 (capa - len) > (len < 1024 ? len : 1024)) {
3097 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3098 (size_t)len + termlen, STR_HEAP_SIZE(str));
3099 RSTRING(str)->as.heap.aux.capa = len;
3100 }
3101 else if (len == slen) return str;
3102 RSTRING(str)->as.heap.len = len;
3103 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3104 }
3105 return str;
3106}
3107
3108static VALUE
3109str_buf_cat(VALUE str, const char *ptr, long len)
3110{
3111 long capa, total, olen, off = -1;
3112 char *sptr;
3113 const int termlen = TERM_LEN(str);
3114#if !USE_RVARGC
3115 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3116#endif
3117
3118 RSTRING_GETMEM(str, sptr, olen);
3119 if (ptr >= sptr && ptr <= sptr + olen) {
3120 off = ptr - sptr;
3121 }
3122 rb_str_modify(str);
3123 if (len == 0) return 0;
3124 if (STR_EMBED_P(str)) {
3125 capa = str_embed_capa(str) - termlen;
3126 sptr = RSTRING(str)->as.embed.ary;
3127 olen = RSTRING_EMBED_LEN(str);
3128 }
3129 else {
3130 capa = RSTRING(str)->as.heap.aux.capa;
3131 sptr = RSTRING(str)->as.heap.ptr;
3132 olen = RSTRING(str)->as.heap.len;
3133 }
3134 if (olen > LONG_MAX - len) {
3135 rb_raise(rb_eArgError, "string sizes too big");
3136 }
3137 total = olen + len;
3138 if (capa < total) {
3139 if (total >= LONG_MAX / 2) {
3140 capa = total;
3141 }
3142 while (total > capa) {
3143 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3144 }
3145 RESIZE_CAPA_TERM(str, capa, termlen);
3146 sptr = RSTRING_PTR(str);
3147 }
3148 if (off != -1) {
3149 ptr = sptr + off;
3150 }
3151 memcpy(sptr + olen, ptr, len);
3152 STR_SET_LEN(str, total);
3153 TERM_FILL(sptr + total, termlen); /* sentinel */
3154
3155 return str;
3156}
3157
/* Append the NUL-terminated string ptr to str.  ptr appears twice in the
 * expansion, so it must be an expression without side effects. */
3158#define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3159
3160VALUE
3161rb_str_cat(VALUE str, const char *ptr, long len)
3162{
3163 if (len == 0) return str;
3164 if (len < 0) {
3165 rb_raise(rb_eArgError, "negative string size (or size too big)");
3166 }
3167 return str_buf_cat(str, ptr, len);
3168}
3169
3170VALUE
3171rb_str_cat_cstr(VALUE str, const char *ptr)
3172{
3173 must_not_null(ptr);
3174 return rb_str_buf_cat(str, ptr, strlen(ptr));
3175}
3176
/* Older entry points kept as aliases: rb_str_buf_cat maps to rb_str_cat,
 * rb_str_buf_cat2 and rb_str_cat2 map to rb_str_cat_cstr. */
3177RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3178RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3179RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3180
/*
 * Append the len bytes at ptr to str, taking encodings and code ranges
 * into account.
 *
 *   ptr_encindex - encoding index of the bytes at ptr
 *   ptr_cr       - their code range (ENC_CODERANGE_UNKNOWN to have it
 *                  scanned when needed)
 *   ptr_cr_ret   - when non-NULL, receives the code range used for ptr
 *
 * Sets the encoding and code range of str to the combined result and
 * returns str.  Raises Encoding::CompatibilityError when the two
 * encodings cannot be combined, and ArgumentError for a negative len.
 */
3181static VALUE
3182rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3183 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3184{
3185 int str_encindex = ENCODING_GET(str);
3186 int res_encindex;
3187 int str_cr, res_cr;
3188 rb_encoding *str_enc, *ptr_enc;
3189
/* an empty receiver is treated as 7-bit */
3190 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3191
3192 if (str_encindex == ptr_encindex) {
/* same encoding: scan ptr only when str's code range is known */
3193 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3194 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3195 }
3196 }
3197 else {
3198 str_enc = rb_enc_from_index(str_encindex);
3199 ptr_enc = rb_enc_from_index(ptr_encindex);
/* different encodings, at least one not ASCII-compatible: only allowed
 * when one side is empty */
3200 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3201 if (len == 0)
3202 return str;
3203 if (RSTRING_LEN(str) == 0) {
3204 rb_str_buf_cat(str, ptr, len);
3205 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3206 return str;
3207 }
3208 goto incompatible;
3209 }
3210 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3211 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3212 }
3213 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3214 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3215 str_cr = rb_enc_str_coderange(str);
3216 }
3217 }
3218 }
3219 if (ptr_cr_ret)
3220 *ptr_cr_ret = ptr_cr;
3221
/* different encodings can only be mixed when one side is 7-bit */
3222 if (str_encindex != ptr_encindex &&
3223 str_cr != ENC_CODERANGE_7BIT &&
3224 ptr_cr != ENC_CODERANGE_7BIT) {
3225 str_enc = rb_enc_from_index(str_encindex);
3226 ptr_enc = rb_enc_from_index(ptr_encindex);
3227 goto incompatible;
3228 }
3229
/* pick the encoding and code range of the result from the two inputs */
3230 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231 res_encindex = str_encindex;
3232 res_cr = ENC_CODERANGE_UNKNOWN;
3233 }
3234 else if (str_cr == ENC_CODERANGE_7BIT) {
3235 if (ptr_cr == ENC_CODERANGE_7BIT) {
3236 res_encindex = str_encindex;
3237 res_cr = ENC_CODERANGE_7BIT;
3238 }
3239 else {
/* 7-bit receiver takes on the encoding of the appended bytes */
3240 res_encindex = ptr_encindex;
3241 res_cr = ptr_cr;
3242 }
3243 }
3244 else if (str_cr == ENC_CODERANGE_VALID) {
3245 res_encindex = str_encindex;
3246 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3247 res_cr = str_cr;
3248 else
3249 res_cr = ptr_cr;
3250 }
3251 else { /* str_cr == ENC_CODERANGE_BROKEN */
3252 res_encindex = str_encindex;
3253 res_cr = str_cr;
/* appending bytes to a broken string requires a re-scan later */
3254 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3255 }
3256
3257 if (len < 0) {
3258 rb_raise(rb_eArgError, "negative string size (or size too big)");
3259 }
3260 str_buf_cat(str, ptr, len);
3261 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3262 return str;
3263
3264 incompatible:
3265 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3266 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3268}
3269
3270VALUE
3271rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3272{
3273 return rb_enc_cr_str_buf_cat(str, ptr, len,
3274 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3275}
3276
3277VALUE
3279{
3280 /* ptr must reference NUL terminated ASCII string. */
3281 int encindex = ENCODING_GET(str);
3282 rb_encoding *enc = rb_enc_from_index(encindex);
3283 if (rb_enc_asciicompat(enc)) {
3284 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3285 encindex, ENC_CODERANGE_7BIT, 0);
3286 }
3287 else {
3288 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3289 while (*ptr) {
3290 unsigned int c = (unsigned char)*ptr;
3291 int len = rb_enc_codelen(c, enc);
3292 rb_enc_mbcput(c, buf, enc);
3293 rb_enc_cr_str_buf_cat(str, buf, len,
3294 encindex, ENC_CODERANGE_VALID, 0);
3295 ptr++;
3296 }
3297 return str;
3298 }
3299}
3300
3301VALUE
3303{
3304 int str2_cr;
3305
3306 str2_cr = ENC_CODERANGE(str2);
3307
3308 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3309 ENCODING_GET(str2), str2_cr, &str2_cr);
3310
3311 ENC_CODERANGE_SET(str2, str2_cr);
3312
3313 return str;
3314}
3315
3316VALUE
3318{
3319 StringValue(str2);
3320 return rb_str_buf_append(str, str2);
3321}
3322
/* Results shorter than this many bytes (counting one extra byte) grow from
 * a copy of the first piece instead of a pre-sized buffer. */
3323#define MIN_PRE_ALLOC_SIZE 48
3324
/*
 * Concatenate the num strings in strary into a new string and return it.
 */
3325MJIT_FUNC_EXPORTED VALUE
3326rb_str_concat_literals(size_t num, const VALUE *strary)
3327{
3328 VALUE str;
3329 size_t i, s;
3330 long len = 1;
3331
3332 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3333 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3334
/* len starts at 1, so it ends up as the total byte length plus one */
3335 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
/* short result: start from a copy of the first piece and append the rest */
3336 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3337 str = rb_str_resurrect(strary[0]);
3338 s = 1;
3339 }
/* long result: allocate the buffer up front and append every piece */
3340 else {
3341 str = rb_str_buf_new(len);
3342 rb_enc_copy(str, strary[0]);
3343 s = 0;
3344 }
3345
3346 for (i = s; i < num; ++i) {
3347 const VALUE v = strary[i];
3348 int encidx = ENCODING_GET(v);
3349
3350 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3351 encidx, ENC_CODERANGE(v), NULL);
/* a result still tagged US-ASCII takes the encoding of the first piece
 * that is not US-ASCII */
3352 if (encidx != ENCINDEX_US_ASCII) {
3353 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3354 rb_enc_set_index(str, encidx);
3355 }
3356 }
3357 return str;
3358}
3359
3360/*
3361 * call-seq:
3362 * concat(*objects) -> string
3363 *
3364 * Concatenates each object in +objects+ to +self+ and returns +self+:
3365 *
3366 * s = 'foo'
3367 * s.concat('bar', 'baz') # => "foobarbaz"
3368 * s # => "foobarbaz"
3369 *
3370 * For each given object +object+ that is an \Integer,
3371 * the value is considered a codepoint and converted to a character before concatenation:
3372 *
3373 * s = 'foo'
3374 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3375 *
3376 * Related: String#<<, which takes a single argument.
3377 */
3378static VALUE
3379rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3380{
3381 str_modifiable(str);
3382
3383 if (argc == 1) {
3384 return rb_str_concat(str, argv[0]);
3385 }
3386 else if (argc > 1) {
3387 int i;
3388 VALUE arg_str = rb_str_tmp_new(0);
3389 rb_enc_copy(arg_str, str);
3390 for (i = 0; i < argc; i++) {
3391 rb_str_concat(arg_str, argv[i]);
3392 }
3393 rb_str_buf_append(str, arg_str);
3394 }
3395
3396 return str;
3397}
3398
3399/*
3400 * call-seq:
3401 * string << object -> string
3402 *
3403 * Concatenates +object+ to +self+ and returns +self+:
3404 *
3405 * s = 'foo'
3406 * s << 'bar' # => "foobar"
3407 * s # => "foobar"
3408 *
3409 * If +object+ is an \Integer,
3410 * the value is considered a codepoint and converted to a character before concatenation:
3411 *
3412 * s = 'foo'
3413 * s << 33 # => "foo!"
3414 *
3415 * Related: String#concat, which takes multiple arguments.
3416 */
3417VALUE
3419{
3420 unsigned int code;
3421 rb_encoding *enc = STR_ENC_GET(str1);
3422 int encidx;
3423
3424 if (RB_INTEGER_TYPE_P(str2)) {
3425 if (rb_num_to_uint(str2, &code) == 0) {
3426 }
3427 else if (FIXNUM_P(str2)) {
3428 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3429 }
3430 else {
3431 rb_raise(rb_eRangeError, "bignum out of char range");
3432 }
3433 }
3434 else {
3435 return rb_str_append(str1, str2);
3436 }
3437
3438 encidx = rb_enc_to_index(enc);
3439 if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3440 /* US-ASCII automatically extended to ASCII-8BIT */
3441 char buf[1];
3442 buf[0] = (char)code;
3443 if (code > 0xFF) {
3444 rb_raise(rb_eRangeError, "%u out of char range", code);
3445 }
3446 rb_str_cat(str1, buf, 1);
3447 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3448 rb_enc_associate_index(str1, ENCINDEX_ASCII);
3450 }
3451 }
3452 else {
3453 long pos = RSTRING_LEN(str1);
3454 int cr = ENC_CODERANGE(str1);
3455 int len;
3456 char *buf;
3457
3458 switch (len = rb_enc_codelen(code, enc)) {
3459 case ONIGERR_INVALID_CODE_POINT_VALUE:
3460 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3461 break;
3462 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3463 case 0:
3464 rb_raise(rb_eRangeError, "%u out of char range", code);
3465 break;
3466 }
3467 buf = ALLOCA_N(char, len + 1);
3468 rb_enc_mbcput(code, buf, enc);
3469 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3470 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3471 }
3472 rb_str_resize(str1, pos+len);
3473 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3474 if (cr == ENC_CODERANGE_7BIT && code > 127)
3476 ENC_CODERANGE_SET(str1, cr);
3477 }
3478 return str1;
3479}
3480
3481/*
3482 * call-seq:
3483 * prepend(*other_strings) -> string
3484 *
3485 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3486 *
3487 * s = 'foo'
3488 * s.prepend('bar', 'baz') # => "barbazfoo"
3489 * s # => "barbazfoo"
3490 *
3491 * Related: String#concat.
3492 */
3493
3494static VALUE
3495rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3496{
3497 str_modifiable(str);
3498
3499 if (argc == 1) {
3500 rb_str_update(str, 0L, 0L, argv[0]);
3501 }
3502 else if (argc > 1) {
3503 int i;
3504 VALUE arg_str = rb_str_tmp_new(0);
3505 rb_enc_copy(arg_str, str);
3506 for (i = 0; i < argc; i++) {
3507 rb_str_append(arg_str, argv[i]);
3508 }
3509 rb_str_update(str, 0L, 0L, arg_str);
3510 }
3511
3512 return str;
3513}
3514
3515st_index_t
3517{
3518 int e = ENCODING_GET(str);
3519 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3520 e = 0;
3521 }
3522 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3523}
3524
3525int
3527{
3528 long len1, len2;
3529 const char *ptr1, *ptr2;
3530 RSTRING_GETMEM(str1, ptr1, len1);
3531 RSTRING_GETMEM(str2, ptr2, len2);
3532 return (len1 != len2 ||
3533 !rb_str_comparable(str1, str2) ||
3534 memcmp(ptr1, ptr2, len1) != 0);
3535}
3536
3537/*
3538 * call-seq:
3539 * hash -> integer
3540 *
3541 * Returns the integer hash value for +self+.
3542 * The value is based on the length, content and encoding of +self+.
3543 *
3544 * Related: Object#hash.
3545 */
3546
3547static VALUE
3548rb_str_hash_m(VALUE str)
3549{
3550 st_index_t hval = rb_str_hash(str);
3551 return ST2FIX(hval);
3552}
3553
/* Smaller of a and b.  Each argument can be evaluated twice, so pass only
 * expressions without side effects. */
3554#define lesser(a,b) (((a)>(b))?(b):(a))
3555
3556int
3558{
3559 int idx1, idx2;
3560 int rc1, rc2;
3561
3562 if (RSTRING_LEN(str1) == 0) return TRUE;
3563 if (RSTRING_LEN(str2) == 0) return TRUE;
3564 idx1 = ENCODING_GET(str1);
3565 idx2 = ENCODING_GET(str2);
3566 if (idx1 == idx2) return TRUE;
3567 rc1 = rb_enc_str_coderange(str1);
3568 rc2 = rb_enc_str_coderange(str2);
3569 if (rc1 == ENC_CODERANGE_7BIT) {
3570 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3571 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3572 return TRUE;
3573 }
3574 if (rc2 == ENC_CODERANGE_7BIT) {
3575 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3576 return TRUE;
3577 }
3578 return FALSE;
3579}
3580
3581int
3583{
3584 long len1, len2;
3585 const char *ptr1, *ptr2;
3586 int retval;
3587
3588 if (str1 == str2) return 0;
3589 RSTRING_GETMEM(str1, ptr1, len1);
3590 RSTRING_GETMEM(str2, ptr2, len2);
3591 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3592 if (len1 == len2) {
3593 if (!rb_str_comparable(str1, str2)) {
3594 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3595 return 1;
3596 return -1;
3597 }
3598 return 0;
3599 }
3600 if (len1 > len2) return 1;
3601 return -1;
3602 }
3603 if (retval > 0) return 1;
3604 return -1;
3605}
3606
3607/*
3608 * call-seq:
3609 * string == object -> true or false
3610 * string === object -> true or false
3611 *
3612 * Returns +true+ if +object+ has the same length and content
3613 * as +self+; +false+ otherwise:
3614 *
3615 * s = 'foo'
3616 * s == 'foo' # => true
3617 * s == 'food' # => false
3618 * s == 'FOO' # => false
3619 *
3620 * Returns +false+ if the two strings' encodings are not compatible:
3621 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3622 *
3623 * If +object+ is not an instance of \String but responds to +to_str+, then the
3624 * two strings are compared using <code>object.==</code>.
3625 */
3626
3627VALUE
3629{
3630 if (str1 == str2) return Qtrue;
3631 if (!RB_TYPE_P(str2, T_STRING)) {
3632 if (!rb_respond_to(str2, idTo_str)) {
3633 return Qfalse;
3634 }
3635 return rb_equal(str2, str1);
3636 }
3637 return rb_str_eql_internal(str1, str2);
3638}
3639
3640/*
3641 * call-seq:
3642 * eql?(object) -> true or false
3643 *
3644 * Returns +true+ if +object+ has the same length and content
3645 * as +self+; +false+ otherwise:
3646 *
3647 * s = 'foo'
3648 * s.eql?('foo') # => true
3649 * s.eql?('food') # => false
3650 * s.eql?('FOO') # => false
3651 *
3652 * Returns +false+ if the two strings' encodings are not compatible:
3653 *
3654 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3655 *
3656 */
3657
3658MJIT_FUNC_EXPORTED VALUE
3659rb_str_eql(VALUE str1, VALUE str2)
3660{
3661 if (str1 == str2) return Qtrue;
3662 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3663 return rb_str_eql_internal(str1, str2);
3664}
3665
3666/*
3667 * call-seq:
3668 * string <=> other_string -> -1, 0, 1, or nil
3669 *
3670 * Compares +self+ and +other_string+, returning:
3671 *
3672 * - -1 if +other_string+ is larger.
3673 * - 0 if the two are equal.
3674 * - 1 if +other_string+ is smaller.
3675 * - +nil+ if the two are incomparable.
3676 *
3677 * Examples:
3678 *
3679 * 'foo' <=> 'foo' # => 0
3680 * 'foo' <=> 'food' # => -1
3681 * 'food' <=> 'foo' # => 1
3682 * 'FOO' <=> 'foo' # => -1
3683 * 'foo' <=> 'FOO' # => 1
3684 * 'foo' <=> 1 # => nil
3685 *
3686 */
3687
3688static VALUE
3689rb_str_cmp_m(VALUE str1, VALUE str2)
3690{
3691 int result;
3692 VALUE s = rb_check_string_type(str2);
3693 if (NIL_P(s)) {
3694 return rb_invcmp(str1, str2);
3695 }
3696 result = rb_str_cmp(str1, s);
3697 return INT2FIX(result);
3698}
3699
3700static VALUE str_casecmp(VALUE str1, VALUE str2);
3701static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3702
3703/*
3704 * call-seq:
3705 * casecmp(other_string) -> -1, 0, 1, or nil
3706 *
3707 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3708 *
3709 * - -1 if <tt>other_string.downcase</tt> is larger.
3710 * - 0 if the two are equal.
3711 * - 1 if <tt>other_string.downcase</tt> is smaller.
3712 * - +nil+ if the two are incomparable.
3713 *
3714 * Examples:
3715 *
3716 * 'foo'.casecmp('foo') # => 0
3717 * 'foo'.casecmp('food') # => -1
3718 * 'food'.casecmp('foo') # => 1
3719 * 'FOO'.casecmp('foo') # => 0
3720 * 'foo'.casecmp('FOO') # => 0
3721 * 'foo'.casecmp(1) # => nil
3722 *
3723 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3724 *
3725 * Related: String#casecmp?.
3726 *
3727 */
3728
3729static VALUE
3730rb_str_casecmp(VALUE str1, VALUE str2)
3731{
3732 VALUE s = rb_check_string_type(str2);
3733 if (NIL_P(s)) {
3734 return Qnil;
3735 }
3736 return str_casecmp(str1, s);
3737}
3738
/*
 * Case-insensitive three-way comparison of two strings, lowercasing
 * through TOLOWER only.
 *
 * Returns Qnil when the encodings are incompatible, otherwise a Fixnum
 * -1, 0 or 1.
 */
3739static VALUE
3740str_casecmp(VALUE str1, VALUE str2)
3741{
3742 long len;
3743 rb_encoding *enc;
3744 const char *p1, *p1end, *p2, *p2end;
3745
3746 enc = rb_enc_compatible(str1, str2);
3747 if (!enc) {
3748 return Qnil;
3749 }
3750
3751 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3752 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
/* both strings can be handled byte by byte */
3753 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3754 while (p1 < p1end && p2 < p2end) {
3755 if (*p1 != *p2) {
/* mask to 0..255 so a negative char is not passed to TOLOWER */
3756 unsigned int c1 = TOLOWER(*p1 & 0xff);
3757 unsigned int c2 = TOLOWER(*p2 & 0xff);
3758 if (c1 != c2)
3759 return INT2FIX(c1 < c2 ? -1 : 1);
3760 }
3761 p1++;
3762 p2++;
3763 }
3764 }
3765 else {
3766 while (p1 < p1end && p2 < p2end) {
3767 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3768 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3769
/* both lookups returned a non-negative code: compare case-insensitively */
3770 if (0 <= c1 && 0 <= c2) {
3771 c1 = TOLOWER(c1);
3772 c2 = TOLOWER(c2);
3773 if (c1 != c2)
3774 return INT2FIX(c1 < c2 ? -1 : 1);
3775 }
/* otherwise compare the two characters bytewise, shorter one first on a tie */
3776 else {
3777 int r;
3778 l1 = rb_enc_mbclen(p1, p1end, enc);
3779 l2 = rb_enc_mbclen(p2, p2end, enc);
3780 len = l1 < l2 ? l1 : l2;
3781 r = memcmp(p1, p2, len);
3782 if (r != 0)
3783 return INT2FIX(r < 0 ? -1 : 1);
3784 if (l1 != l2)
3785 return INT2FIX(l1 < l2 ? -1 : 1);
3786 }
3787 p1 += l1;
3788 p2 += l2;
3789 }
3790 }
/* one string is a prefix of the other: the longer one is greater */
3791 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3792 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3793 return INT2FIX(-1);
3794}
3795
3796/*
3797 * call-seq:
3798 * casecmp?(other_string) -> true, false, or nil
3799 *
3800 * Returns +true+ if +self+ and +other_string+ are equal after
3801 * Unicode case folding, otherwise +false+:
3802 *
3803 * 'foo'.casecmp?('foo') # => true
3804 * 'foo'.casecmp?('food') # => false
3805 * 'food'.casecmp?('foo') # => false
3806 * 'FOO'.casecmp?('foo') # => true
3807 * 'foo'.casecmp?('FOO') # => true
3808 *
3809 * Returns +nil+ if the two values are incomparable:
3810 *
3811 * 'foo'.casecmp?(1) # => nil
3812 *
3813 * See {Case Mapping}[doc/case_mapping_rdoc.html].
3814 *
3815 * Related: String#casecmp.
3816 *
3817 */
3818
3819static VALUE
3820rb_str_casecmp_p(VALUE str1, VALUE str2)
3821{
3822 VALUE s = rb_check_string_type(str2);
3823 if (NIL_P(s)) {
3824 return Qnil;
3825 }
3826 return str_casecmp_p(str1, s);
3827}
3828
3829static VALUE
3830str_casecmp_p(VALUE str1, VALUE str2)
3831{
3832 rb_encoding *enc;
3833 VALUE folded_str1, folded_str2;
3834 VALUE fold_opt = sym_fold;
3835
3836 enc = rb_enc_compatible(str1, str2);
3837 if (!enc) {
3838 return Qnil;
3839 }
3840
3841 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3842 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3843
3844 return rb_str_eql(folded_str1, folded_str2);
3845}
3846
3847static long
3848strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3849 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3850{
3851 const char *search_start = str_ptr;
3852 long pos, search_len = str_len - offset;
3853
3854 for (;;) {
3855 const char *t;
3856 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3857 if (pos < 0) return pos;
3858 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3859 if (t == search_start + pos) break;
3860 search_len -= t - search_start;
3861 if (search_len <= 0) return -1;
3862 offset += t - search_start;
3863 search_start = t;
3864 }
3865 return pos + offset;
3866}
3867
3868#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3869
3870static long
3871rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3872{
3873 const char *str_ptr, *str_ptr_end, *sub_ptr;
3874 long str_len, sub_len;
3875 rb_encoding *enc;
3876
3877 enc = rb_enc_check(str, sub);
3878 if (is_broken_string(sub)) return -1;
3879
3880 str_ptr = RSTRING_PTR(str);
3881 str_ptr_end = RSTRING_END(str);
3882 str_len = RSTRING_LEN(str);
3883 sub_ptr = RSTRING_PTR(sub);
3884 sub_len = RSTRING_LEN(sub);
3885
3886 if (str_len < sub_len) return -1;
3887
3888 if (offset != 0) {
3889 long str_len_char, sub_len_char;
3890 int single_byte = single_byte_optimizable(str);
3891 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3892 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3893 if (offset < 0) {
3894 offset += str_len_char;
3895 if (offset < 0) return -1;
3896 }
3897 if (str_len_char - offset < sub_len_char) return -1;
3898 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3899 str_ptr += offset;
3900 }
3901 if (sub_len == 0) return offset;
3902
3903 /* need proceed one character at a time */
3904 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3905}
3906
3907
3908/*
3909 * call-seq:
3910 * index(substring, offset = 0) -> integer or nil
3911 * index(regexp, offset = 0) -> integer or nil
3912 *
3913 * Returns the \Integer index of the first occurrence of the given +substring+,
3914 * or +nil+ if none found:
3915 *
3916 * 'foo'.index('f') # => 0
3917 * 'foo'.index('o') # => 1
3918 * 'foo'.index('oo') # => 1
3919 * 'foo'.index('ooo') # => nil
3920 *
3921 * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3922 * or +nil+ if none found:
3923 *
3924 * 'foo'.index(/f/) # => 0
3925 * 'foo'.index(/o/) # => 1
3926 * 'foo'.index(/oo/) # => 1
3927 * 'foo'.index(/ooo/) # => nil
3928 *
3929 * \Integer argument +offset+, if given, specifies the position in the
3930 * string to begin the search:
3931 *
3932 * 'foo'.index('o', 1) # => 1
3933 * 'foo'.index('o', 2) # => 2
3934 * 'foo'.index('o', 3) # => nil
3935 *
3936 * If +offset+ is negative, counts backward from the end of +self+:
3937 *
3938 * 'foo'.index('o', -1) # => 2
3939 * 'foo'.index('o', -2) # => 1
3940 * 'foo'.index('o', -3) # => 1
3941 * 'foo'.index('o', -4) # => nil
3942 *
3943 * Related: String#rindex.
3944 */
3945
3946static VALUE
3947rb_str_index_m(int argc, VALUE *argv, VALUE str)
3948{
3949 VALUE sub;
3950 VALUE initpos;
3951 long pos;
3952
3953 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3954 pos = NUM2LONG(initpos);
3955 }
3956 else {
3957 pos = 0;
3958 }
3959 if (pos < 0) {
3960 pos += str_strlen(str, NULL);
3961 if (pos < 0) {
3962 if (RB_TYPE_P(sub, T_REGEXP)) {
3964 }
3965 return Qnil;
3966 }
3967 }
3968
3969 if (RB_TYPE_P(sub, T_REGEXP)) {
3970 if (pos > str_strlen(str, NULL))
3971 return Qnil;
3972 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3973 rb_enc_check(str, sub), single_byte_optimizable(str));
3974
3975 if (rb_reg_search(sub, str, pos, 0) < 0) {
3976 return Qnil;
3977 }
3978 else {
3979 VALUE match = rb_backref_get();
3980 struct re_registers *regs = RMATCH_REGS(match);
3981 pos = rb_str_sublen(str, BEG(0));
3982 return LONG2NUM(pos);
3983 }
3984 }
3985 else {
3986 StringValue(sub);
3987 pos = rb_str_index(str, sub, pos);
3988 pos = rb_str_sublen(str, pos);
3989 }
3990
3991 if (pos == -1) return Qnil;
3992 return LONG2NUM(pos);
3993}
3994
3995#ifdef HAVE_MEMRCHR
3996static long
3997str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3998{
3999 char *hit, *adjusted;
4000 int c;
4001 long slen, searchlen;
4002 char *sbeg, *e, *t;
4003
4004 slen = RSTRING_LEN(sub);
4005 if (slen == 0) return pos;
4006 sbeg = RSTRING_PTR(str);
4007 e = RSTRING_END(str);
4008 t = RSTRING_PTR(sub);
4009 c = *t & 0xff;
4010 searchlen = s - sbeg + 1;
4011
4012 do {
4013 hit = memrchr(sbeg, c, searchlen);
4014 if (!hit) break;
4015 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4016 if (hit != adjusted) {
4017 searchlen = adjusted - sbeg;
4018 continue;
4019 }
4020 if (memcmp(hit, t, slen) == 0)
4021 return rb_str_sublen(str, hit - sbeg);
4022 searchlen = adjusted - sbeg;
4023 } while (searchlen > 0);
4024
4025 return -1;
4026}
4027#else
4028static long
4029str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4030{
4031 long slen;
4032 char *sbeg, *e, *t;
4033
4034 sbeg = RSTRING_PTR(str);
4035 e = RSTRING_END(str);
4036 t = RSTRING_PTR(sub);
4037 slen = RSTRING_LEN(sub);
4038
4039 while (s) {
4040 if (memcmp(s, t, slen) == 0) {
4041 return pos;
4042 }
4043 if (pos == 0) break;
4044 pos--;
4045 s = rb_enc_prev_char(sbeg, s, e, enc);
4046 }
4047
4048 return -1;
4049}
4050#endif
4051
4052static long
4053rb_str_rindex(VALUE str, VALUE sub, long pos)
4054{
4055 long len, slen;
4056 char *sbeg, *s;
4057 rb_encoding *enc;
4058 int singlebyte;
4059
4060 enc = rb_enc_check(str, sub);
4061 if (is_broken_string(sub)) return -1;
4062 singlebyte = single_byte_optimizable(str);
4063 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4064 slen = str_strlen(sub, enc); /* rb_enc_check */
4065
4066 /* substring longer than string */
4067 if (len < slen) return -1;
4068 if (len - pos < slen) pos = len - slen;
4069 if (len == 0) return pos;
4070
4071 sbeg = RSTRING_PTR(str);
4072
4073 if (pos == 0) {
4074 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4075 return 0;
4076 else
4077 return -1;
4078 }
4079
4080 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4081 return str_rindex(str, sub, s, pos, enc);
4082}
4083
4084/*
4085 * call-seq:
4086 * rindex(substring, offset = self.length) -> integer or nil
4087 * rindex(regexp, offset = self.length) -> integer or nil
4088 *
4089 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4090 * or +nil+ if none found:
4091 *
4092 * 'foo'.rindex('f') # => 0
4093 * 'foo'.rindex('o') # => 2
4094 * 'foo'.rindex('oo') # => 1
4095 * 'foo'.rindex('ooo') # => nil
4096 *
4097 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4098 * or +nil+ if none found:
4099 *
4100 * 'foo'.rindex(/f/) # => 0
4101 * 'foo'.rindex(/o/) # => 2
4102 * 'foo'.rindex(/oo/) # => 1
4103 * 'foo'.rindex(/ooo/) # => nil
4104 *
4105 * The _last_ match means starting at the possible last position, not
4106 * the last of longest matches.
4107 *
4108 * 'foo'.rindex(/o+/) # => 2
4109 * $~ #=> #<MatchData "o">
4110 *
4111 * To get the last longest match, combine the pattern with a negative
4112 * lookbehind.
4113 *
4114 * 'foo'.rindex(/(?<!o)o+/) # => 1
4115 * $~ #=> #<MatchData "oo">
4116 *
4117 * Or use String#index with a negative lookahead.
4118 *
4119 * 'foo'.index(/o+(?!.*o)/) # => 1
4120 * $~ #=> #<MatchData "oo">
4121 *
4122 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4123 * string to _end_ the search:
4124 *
4125 * 'foo'.rindex('o', 0) # => nil
4126 * 'foo'.rindex('o', 1) # => 1
4127 * 'foo'.rindex('o', 2) # => 2
4128 * 'foo'.rindex('o', 3) # => 2
4129 *
4130 * If +offset+ is a negative \Integer, the maximum starting position in the
4131 * string to _end_ the search is the sum of the string's length and +offset+:
4132 *
4133 * 'foo'.rindex('o', -1) # => 2
4134 * 'foo'.rindex('o', -2) # => 1
4135 * 'foo'.rindex('o', -3) # => nil
4136 * 'foo'.rindex('o', -4) # => nil
4137 *
4138 * Related: String#index.
4139 */
4140
4141static VALUE
4142rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4143{
4144 VALUE sub;
4145 VALUE vpos;
4146 rb_encoding *enc = STR_ENC_GET(str);
4147 long pos, len = str_strlen(str, enc); /* str's enc */
4148
4149 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4150 pos = NUM2LONG(vpos);
4151 if (pos < 0) {
4152 pos += len;
4153 if (pos < 0) {
4154 if (RB_TYPE_P(sub, T_REGEXP)) {
4156 }
4157 return Qnil;
4158 }
4159 }
4160 if (pos > len) pos = len;
4161 }
4162 else {
4163 pos = len;
4164 }
4165
4166 if (RB_TYPE_P(sub, T_REGEXP)) {
4167 /* enc = rb_get_check(str, sub); */
4168 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4169 enc, single_byte_optimizable(str));
4170
4171 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4172 VALUE match = rb_backref_get();
4173 struct re_registers *regs = RMATCH_REGS(match);
4174 pos = rb_str_sublen(str, BEG(0));
4175 return LONG2NUM(pos);
4176 }
4177 }
4178 else {
4179 StringValue(sub);
4180 pos = rb_str_rindex(str, sub, pos);
4181 if (pos >= 0) return LONG2NUM(pos);
4182 }
4183 return Qnil;
4184}
4185
4186/*
4187 * call-seq:
4188 * string =~ regexp -> integer or nil
4189 * string =~ object -> integer or nil
4190 *
4191 * Returns the \Integer index of the first substring that matches
4192 * the given +regexp+, or +nil+ if no match found:
4193 *
4194 * 'foo' =~ /f/ # => 0
4195 * 'foo' =~ /o/ # => 1
4196 * 'foo' =~ /x/ # => nil
4197 *
4198 * Note: also updates
4199 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4200 *
4201 * If the given +object+ is not a \Regexp, returns the value
4202 * returned by <tt>object =~ self</tt>.
4203 *
4204 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4205 * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4206 *
4207 * number= nil
4208 * "no. 9" =~ /(?<number>\d+)/
4209 * number # => nil (not assigned)
4210 * /(?<number>\d+)/ =~ "no. 9"
4211 * number #=> "9"
4212 *
4213 */
4214
4215static VALUE
4216rb_str_match(VALUE x, VALUE y)
4217{
4218 switch (OBJ_BUILTIN_TYPE(y)) {
4219 case T_STRING:
4220 rb_raise(rb_eTypeError, "type mismatch: String given");
4221
4222 case T_REGEXP:
4223 return rb_reg_match(y, x);
4224
4225 default:
4226 return rb_funcall(y, idEqTilde, 1, x);
4227 }
4228}
4229
4230
4231static VALUE get_pat(VALUE);
4232
4233
4234/*
4235 * call-seq:
4236 * match(pattern, offset = 0) -> matchdata or nil
4237 * match(pattern, offset = 0) {|matchdata| ... } -> object
4238 *
4239 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4240 *
4241 * Note: also updates
4242 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4243 *
4244 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4245 * regexp = Regexp.new(pattern)
4246 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4247 * (see Regexp#match):
4248 * matchdata = regexp.match(self)
4249 *
4250 * With no block given, returns the computed +matchdata+:
4251 *
4252 * 'foo'.match('f') # => #<MatchData "f">
4253 * 'foo'.match('o') # => #<MatchData "o">
4254 * 'foo'.match('x') # => nil
4255 *
4256 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4257 *
4258 * 'foo'.match('f', 1) # => nil
4259 * 'foo'.match('o', 1) # => #<MatchData "o">
4260 *
4261 * With a block given, calls the block with the computed +matchdata+
4262 * and returns the block's return value:
4263 *
4264 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4265 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4266 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4267 *
4268 */
4269
4270static VALUE
4271rb_str_match_m(int argc, VALUE *argv, VALUE str)
4272{
4273 VALUE re, result;
4274 if (argc < 1)
4275 rb_check_arity(argc, 1, 2);
4276 re = argv[0];
4277 argv[0] = str;
4278 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4279 if (!NIL_P(result) && rb_block_given_p()) {
4280 return rb_yield(result);
4281 }
4282 return result;
4283}
4284
4285/*
4286 * call-seq:
4287 * match?(pattern, offset = 0) -> true or false
4288 *
4289 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4290 *
4291 * Note: does not update
4292 * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4293 *
4294 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4295 * regexp = Regexp.new(pattern)
4296 *
4297 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4298 * +false+ otherwise:
4299 *
4300 * 'foo'.match?(/o/) # => true
4301 * 'foo'.match?('o') # => true
4302 * 'foo'.match?(/x/) # => false
4303 *
4304 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4305 * 'foo'.match?('f', 1) # => false
4306 * 'foo'.match?('o', 1) # => true
4307 *
4308 */
4309
4310static VALUE
4311rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4312{
4313 VALUE re;
4314 rb_check_arity(argc, 1, 2);
4315 re = get_pat(argv[0]);
4316 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4317}
4318
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* invalid character, or no neighbor exists */
    NEIGHBOR_FOUND = 1,    /* the neighbor was written in place */
    NEIGHBOR_WRAPPED = 2   /* stepped past the end of the range */
};
4324
4325static enum neighbor_char
4326enc_succ_char(char *p, long len, rb_encoding *enc)
4327{
4328 long i;
4329 int l;
4330
4331 if (rb_enc_mbminlen(enc) > 1) {
4332 /* wchar, trivial case */
4333 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4334 if (!MBCLEN_CHARFOUND_P(r)) {
4335 return NEIGHBOR_NOT_CHAR;
4336 }
4337 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4338 l = rb_enc_code_to_mbclen(c, enc);
4339 if (!l) return NEIGHBOR_NOT_CHAR;
4340 if (l != len) return NEIGHBOR_WRAPPED;
4341 rb_enc_mbcput(c, p, enc);
4342 r = rb_enc_precise_mbclen(p, p + len, enc);
4343 if (!MBCLEN_CHARFOUND_P(r)) {
4344 return NEIGHBOR_NOT_CHAR;
4345 }
4346 return NEIGHBOR_FOUND;
4347 }
4348 while (1) {
4349 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4350 p[i] = '\0';
4351 if (i < 0)
4352 return NEIGHBOR_WRAPPED;
4353 ++((unsigned char*)p)[i];
4354 l = rb_enc_precise_mbclen(p, p+len, enc);
4355 if (MBCLEN_CHARFOUND_P(l)) {
4356 l = MBCLEN_CHARFOUND_LEN(l);
4357 if (l == len) {
4358 return NEIGHBOR_FOUND;
4359 }
4360 else {
4361 memset(p+l, 0xff, len-l);
4362 }
4363 }
4364 if (MBCLEN_INVALID_P(l) && i < len-1) {
4365 long len2;
4366 int l2;
4367 for (len2 = len-1; 0 < len2; len2--) {
4368 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4369 if (!MBCLEN_INVALID_P(l2))
4370 break;
4371 }
4372 memset(p+len2+1, 0xff, len-(len2+1));
4373 }
4374 }
4375}
4376
4377static enum neighbor_char
4378enc_pred_char(char *p, long len, rb_encoding *enc)
4379{
4380 long i;
4381 int l;
4382 if (rb_enc_mbminlen(enc) > 1) {
4383 /* wchar, trivial case */
4384 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4385 if (!MBCLEN_CHARFOUND_P(r)) {
4386 return NEIGHBOR_NOT_CHAR;
4387 }
4388 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4389 if (!c) return NEIGHBOR_NOT_CHAR;
4390 --c;
4391 l = rb_enc_code_to_mbclen(c, enc);
4392 if (!l) return NEIGHBOR_NOT_CHAR;
4393 if (l != len) return NEIGHBOR_WRAPPED;
4394 rb_enc_mbcput(c, p, enc);
4395 r = rb_enc_precise_mbclen(p, p + len, enc);
4396 if (!MBCLEN_CHARFOUND_P(r)) {
4397 return NEIGHBOR_NOT_CHAR;
4398 }
4399 return NEIGHBOR_FOUND;
4400 }
4401 while (1) {
4402 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4403 p[i] = '\xff';
4404 if (i < 0)
4405 return NEIGHBOR_WRAPPED;
4406 --((unsigned char*)p)[i];
4407 l = rb_enc_precise_mbclen(p, p+len, enc);
4408 if (MBCLEN_CHARFOUND_P(l)) {
4409 l = MBCLEN_CHARFOUND_LEN(l);
4410 if (l == len) {
4411 return NEIGHBOR_FOUND;
4412 }
4413 else {
4414 memset(p+l, 0, len-l);
4415 }
4416 }
4417 if (MBCLEN_INVALID_P(l) && i < len-1) {
4418 long len2;
4419 int l2;
4420 for (len2 = len-1; 0 < len2; len2--) {
4421 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4422 if (!MBCLEN_INVALID_P(l2))
4423 break;
4424 }
4425 memset(p+len2+1, 0, len-(len2+1));
4426 }
4427 }
4428}
4429
4430/*
4431 overwrite +p+ by succeeding letter in +enc+ and returns
4432 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4433 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4434 assuming each ranges are successive, and mbclen
4435 never change in each ranges.
4436 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4437 character.
4438 */
4439static enum neighbor_char
4440enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4441{
4442 enum neighbor_char ret;
4443 unsigned int c;
4444 int ctype;
4445 int range;
4446 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4447
4448 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4449 int try;
4450 const int max_gaps = 1;
4451
4452 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4453 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4454 ctype = ONIGENC_CTYPE_DIGIT;
4455 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4456 ctype = ONIGENC_CTYPE_ALPHA;
4457 else
4458 return NEIGHBOR_NOT_CHAR;
4459
4460 MEMCPY(save, p, char, len);
4461 for (try = 0; try <= max_gaps; ++try) {
4462 ret = enc_succ_char(p, len, enc);
4463 if (ret == NEIGHBOR_FOUND) {
4464 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4465 if (rb_enc_isctype(c, ctype, enc))
4466 return NEIGHBOR_FOUND;
4467 }
4468 }
4469 MEMCPY(p, save, char, len);
4470 range = 1;
4471 while (1) {
4472 MEMCPY(save, p, char, len);
4473 ret = enc_pred_char(p, len, enc);
4474 if (ret == NEIGHBOR_FOUND) {
4475 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4476 if (!rb_enc_isctype(c, ctype, enc)) {
4477 MEMCPY(p, save, char, len);
4478 break;
4479 }
4480 }
4481 else {
4482 MEMCPY(p, save, char, len);
4483 break;
4484 }
4485 range++;
4486 }
4487 if (range == 1) {
4488 return NEIGHBOR_NOT_CHAR;
4489 }
4490
4491 if (ctype != ONIGENC_CTYPE_DIGIT) {
4492 MEMCPY(carry, p, char, len);
4493 return NEIGHBOR_WRAPPED;
4494 }
4495
4496 MEMCPY(carry, p, char, len);
4497 enc_succ_char(carry, len, enc);
4498 return NEIGHBOR_WRAPPED;
4499}
4500
4501
4502static VALUE str_succ(VALUE str);
4503
4504/*
4505 * call-seq:
4506 * succ -> new_str
4507 *
4508 * Returns the successor to +self+. The successor is calculated by
4509 * incrementing characters.
4510 *
4511 * The first character to be incremented is the rightmost alphanumeric,
4512 * or, if there are no alphanumerics, the rightmost character:
4513 *
4514 * 'THX1138'.succ # => "THX1139"
4515 * '<<koala>>'.succ # => "<<koalb>>"
4516 * '***'.succ # => '**+'
4517 *
4518 * The successor to a digit is another digit, "carrying" to the next-left
4519 * character for a "rollover" from 9 to 0, and prepending another digit
4520 * if necessary:
4521 *
4522 * '00'.succ # => "01"
4523 * '09'.succ # => "10"
4524 * '99'.succ # => "100"
4525 *
4526 * The successor to a letter is another letter of the same case,
4527 * carrying to the next-left character for a rollover,
4528 * and prepending another same-case letter if necessary:
4529 *
4530 * 'aa'.succ # => "ab"
4531 * 'az'.succ # => "ba"
4532 * 'zz'.succ # => "aaa"
4533 * 'AA'.succ # => "AB"
4534 * 'AZ'.succ # => "BA"
4535 * 'ZZ'.succ # => "AAA"
4536 *
4537 * The successor to a non-alphanumeric character is the next character
4538 * in the underlying character set's collating sequence,
4539 * carrying to the next-left character for a rollover,
4540 * and prepending another character if necessary:
4541 *
4542 * s = 0.chr * 3
4543 * s # => "\x00\x00\x00"
4544 * s.succ # => "\x00\x00\x01"
4545 * s = 255.chr * 3
4546 * s # => "\xFF\xFF\xFF"
4547 * s.succ # => "\x01\x00\x00\x00"
4548 *
4549 * Carrying can occur between and among mixtures of alphanumeric characters:
4550 *
4551 * s = 'zz99zz99'
4552 * s.succ # => "aaa00aa00"
4553 * s = '99zz99zz'
4554 * s.succ # => "100aa00aa"
4555 *
4556 * The successor to an empty \String is a new empty \String:
4557 *
4558 * ''.succ # => ""
4559 *
4560 * String#next is an alias for String#succ.
4561 */
4562
4563VALUE
4565{
4566 VALUE str;
4567 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4568 rb_enc_cr_str_copy_for_substr(str, orig);
4569 return str_succ(str);
4570}
4571
4572static VALUE
4573str_succ(VALUE str)
4574{
4575 rb_encoding *enc;
4576 char *sbeg, *s, *e, *last_alnum = 0;
4577 int found_alnum = 0;
4578 long l, slen;
4579 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4580 long carry_pos = 0, carry_len = 1;
4581 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4582
4583 slen = RSTRING_LEN(str);
4584 if (slen == 0) return str;
4585
4586 enc = STR_ENC_GET(str);
4587 sbeg = RSTRING_PTR(str);
4588 s = e = sbeg + slen;
4589
4590 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4591 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4592 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4593 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4594 break;
4595 }
4596 }
4597 l = rb_enc_precise_mbclen(s, e, enc);
4598 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4599 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4600 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4601 switch (neighbor) {
4602 case NEIGHBOR_NOT_CHAR:
4603 continue;
4604 case NEIGHBOR_FOUND:
4605 return str;
4606 case NEIGHBOR_WRAPPED:
4607 last_alnum = s;
4608 break;
4609 }
4610 found_alnum = 1;
4611 carry_pos = s - sbeg;
4612 carry_len = l;
4613 }
4614 if (!found_alnum) { /* str contains no alnum */
4615 s = e;
4616 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4617 enum neighbor_char neighbor;
4618 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4619 l = rb_enc_precise_mbclen(s, e, enc);
4620 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4621 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4622 MEMCPY(tmp, s, char, l);
4623 neighbor = enc_succ_char(tmp, l, enc);
4624 switch (neighbor) {
4625 case NEIGHBOR_FOUND:
4626 MEMCPY(s, tmp, char, l);
4627 return str;
4628 break;
4629 case NEIGHBOR_WRAPPED:
4630 MEMCPY(s, tmp, char, l);
4631 break;
4632 case NEIGHBOR_NOT_CHAR:
4633 break;
4634 }
4635 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4636 /* wrapped to \0...\0. search next valid char. */
4637 enc_succ_char(s, l, enc);
4638 }
4639 if (!rb_enc_asciicompat(enc)) {
4640 MEMCPY(carry, s, char, l);
4641 carry_len = l;
4642 }
4643 carry_pos = s - sbeg;
4644 }
4646 }
4647 RESIZE_CAPA(str, slen + carry_len);
4648 sbeg = RSTRING_PTR(str);
4649 s = sbeg + carry_pos;
4650 memmove(s + carry_len, s, slen - carry_pos);
4651 memmove(s, carry, carry_len);
4652 slen += carry_len;
4653 STR_SET_LEN(str, slen);
4654 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4656 return str;
4657}
4658
4659
4660/*
4661 * call-seq:
4662 * succ! -> self
4663 *
4664 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4665 *
4666 * String#next! is an alias for String#succ!.
4667 */
4668
4669static VALUE
4670rb_str_succ_bang(VALUE str)
4671{
4672 rb_str_modify(str);
4673 str_succ(str);
4674 return str;
4675}
4676
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (also when +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4686
4687static int
4688str_upto_i(VALUE str, VALUE arg)
4689{
4690 rb_yield(str);
4691 return 0;
4692}
4693
4694/*
4695 * call-seq:
4696 * upto(other_string, exclusive = false) {|string| ... } -> self
4697 * upto(other_string, exclusive = false) -> new_enumerator
4698 *
4699 * With a block given, calls the block with each \String value
4700 * returned by successive calls to String#succ;
4701 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4702 * the sequence terminates when value +other_string+ is reached;
4703 * returns +self+:
4704 *
4705 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4706 * Output:
4707 *
4708 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4709 *
4710 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4711 *
4712 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4713 *
4714 * Output:
4715 *
4716 * a8 a9 b0 b1 b2 b3 b4 b5
4717 *
4718 * If +other_string+ would not be reached, does not call the block:
4719 *
4720 * '25'.upto('5') {|s| fail s }
4721 * 'aa'.upto('a') {|s| fail s }
4722 *
4723 * With no block given, returns a new \Enumerator:
4724 *
4725 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4726 *
4727 */
4728
4729static VALUE
4730rb_str_upto(int argc, VALUE *argv, VALUE beg)
4731{
4732 VALUE end, exclusive;
4733
4734 rb_scan_args(argc, argv, "11", &end, &exclusive);
4735 RETURN_ENUMERATOR(beg, argc, argv);
4736 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4737}
4738
4739VALUE
4740rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4741{
4742 VALUE current, after_end;
4743 ID succ;
4744 int n, ascii;
4745 rb_encoding *enc;
4746
4747 CONST_ID(succ, "succ");
4748 StringValue(end);
4749 enc = rb_enc_check(beg, end);
4750 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4751 /* single character */
4752 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4753 char c = RSTRING_PTR(beg)[0];
4754 char e = RSTRING_PTR(end)[0];
4755
4756 if (c > e || (excl && c == e)) return beg;
4757 for (;;) {
4758 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4759 if (!excl && c == e) break;
4760 c++;
4761 if (excl && c == e) break;
4762 }
4763 return beg;
4764 }
4765 /* both edges are all digits */
4766 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4767 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4768 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4769 VALUE b, e;
4770 int width;
4771
4772 width = RSTRING_LENINT(beg);
4773 b = rb_str_to_inum(beg, 10, FALSE);
4774 e = rb_str_to_inum(end, 10, FALSE);
4775 if (FIXNUM_P(b) && FIXNUM_P(e)) {
4776 long bi = FIX2LONG(b);
4777 long ei = FIX2LONG(e);
4778 rb_encoding *usascii = rb_usascii_encoding();
4779
4780 while (bi <= ei) {
4781 if (excl && bi == ei) break;
4782 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4783 bi++;
4784 }
4785 }
4786 else {
4787 ID op = excl ? '<' : idLE;
4788 VALUE args[2], fmt = rb_fstring_lit("%.*d");
4789
4790 args[0] = INT2FIX(width);
4791 while (rb_funcall(b, op, 1, e)) {
4792 args[1] = b;
4793 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4794 b = rb_funcallv(b, succ, 0, 0);
4795 }
4796 }
4797 return beg;
4798 }
4799 /* normal case */
4800 n = rb_str_cmp(beg, end);
4801 if (n > 0 || (excl && n == 0)) return beg;
4802
4803 after_end = rb_funcallv(end, succ, 0, 0);
4804 current = str_duplicate(rb_cString, beg);
4805 while (!rb_str_equal(current, after_end)) {
4806 VALUE next = Qnil;
4807 if (excl || !rb_str_equal(current, end))
4808 next = rb_funcallv(current, succ, 0, 0);
4809 if ((*each)(current, arg)) break;
4810 if (NIL_P(next)) break;
4811 current = next;
4812 StringValue(current);
4813 if (excl && rb_str_equal(current, end)) break;
4814 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4815 break;
4816 }
4817
4818 return beg;
4819}
4820
4821VALUE
4822rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4823{
4824 VALUE current;
4825 ID succ;
4826
4827 CONST_ID(succ, "succ");
4828 /* both edges are all digits */
4829 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4830 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4831 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4832 int width = RSTRING_LENINT(beg);
4833 b = rb_str_to_inum(beg, 10, FALSE);
4834 if (FIXNUM_P(b)) {
4835 long bi = FIX2LONG(b);
4836 rb_encoding *usascii = rb_usascii_encoding();
4837
4838 while (FIXABLE(bi)) {
4839 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4840 bi++;
4841 }
4842 b = LONG2NUM(bi);
4843 }
4844 args[0] = INT2FIX(width);
4845 while (1) {
4846 args[1] = b;
4847 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4848 b = rb_funcallv(b, succ, 0, 0);
4849 }
4850 }
4851 /* normal case */
4852 current = str_duplicate(rb_cString, beg);
4853 while (1) {
4854 VALUE next = rb_funcallv(current, succ, 0, 0);
4855 if ((*each)(current, arg)) break;
4856 current = next;
4857 StringValue(current);
4858 if (RSTRING_LEN(current) == 0)
4859 break;
4860 }
4861
4862 return beg;
4863}
4864
4865static int
4866include_range_i(VALUE str, VALUE arg)
4867{
4868 VALUE *argp = (VALUE *)arg;
4869 if (!rb_equal(str, *argp)) return 0;
4870 *argp = Qnil;
4871 return 1;
4872}
4873
4874VALUE
4875rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4876{
4877 beg = rb_str_new_frozen(beg);
4878 StringValue(end);
4879 end = rb_str_new_frozen(end);
4880 if (NIL_P(val)) return Qfalse;
4881 val = rb_check_string_type(val);
4882 if (NIL_P(val)) return Qfalse;
4883 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4884 rb_enc_asciicompat(STR_ENC_GET(end)) &&
4885 rb_enc_asciicompat(STR_ENC_GET(val))) {
4886 const char *bp = RSTRING_PTR(beg);
4887 const char *ep = RSTRING_PTR(end);
4888 const char *vp = RSTRING_PTR(val);
4889 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4890 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4891 return Qfalse;
4892 else {
4893 char b = *bp;
4894 char e = *ep;
4895 char v = *vp;
4896
4897 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4898 if (b <= v && v < e) return Qtrue;
4899 return RBOOL(!RTEST(exclusive) && v == e);
4900 }
4901 }
4902 }
4903#if 0
4904 /* both edges are all digits */
4905 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4906 all_digits_p(bp, RSTRING_LEN(beg)) &&
4907 all_digits_p(ep, RSTRING_LEN(end))) {
4908 /* TODO */
4909 }
4910#endif
4911 }
4912 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4913
4914 return RBOOL(NIL_P(val));
4915}
4916
4917static VALUE
4918rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4919{
4920 if (rb_reg_search(re, str, 0, 0) >= 0) {
4921 VALUE match = rb_backref_get();
4922 int nth = rb_reg_backref_number(match, backref);
4923 return rb_reg_nth_match(nth, match);
4924 }
4925 return Qnil;
4926}
4927
4928static VALUE
4929rb_str_aref(VALUE str, VALUE indx)
4930{
4931 long idx;
4932
4933 if (FIXNUM_P(indx)) {
4934 idx = FIX2LONG(indx);
4935 }
4936 else if (RB_TYPE_P(indx, T_REGEXP)) {
4937 return rb_str_subpat(str, indx, INT2FIX(0));
4938 }
4939 else if (RB_TYPE_P(indx, T_STRING)) {
4940 if (rb_str_index(str, indx, 0) != -1)
4941 return str_duplicate(rb_cString, indx);
4942 return Qnil;
4943 }
4944 else {
4945 /* check if indx is Range */
4946 long beg, len = str_strlen(str, NULL);
4947 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4948 case Qfalse:
4949 break;
4950 case Qnil:
4951 return Qnil;
4952 default:
4953 return rb_str_substr(str, beg, len);
4954 }
4955 idx = NUM2LONG(indx);
4956 }
4957
4958 return str_substr(str, idx, 1, FALSE);
4959}
4960
4961
4962/*
4963 * call-seq:
4964 * string[index] -> new_string or nil
4965 * string[start, length] -> new_string or nil
4966 * string[range] -> new_string or nil
4967 * string[regexp, capture = 0] -> new_string or nil
4968 * string[substring] -> new_string or nil
4969 *
4970 * Returns the substring of +self+ specified by the arguments.
4971 *
4972 * When the single \Integer argument +index+ is given,
4973 * returns the 1-character substring found in +self+ at offset +index+:
4974 *
4975 * 'bar'[2] # => "r"
4976 *
4977 * Counts backward from the end of +self+ if +index+ is negative:
4978 *
4979 * 'foo'[-3] # => "f"
4980 *
4981 * Returns +nil+ if +index+ is out of range:
4982 *
4983 * 'foo'[3] # => nil
4984 * 'foo'[-4] # => nil
4985 *
4986 * When the two \Integer arguments +start+ and +length+ are given,
4987 * returns the substring of the given +length+ found in +self+ at offset +start+:
4988 *
4989 * 'foo'[0, 2] # => "fo"
4990 * 'foo'[0, 0] # => ""
4991 *
4992 * Counts backward from the end of +self+ if +start+ is negative:
4993 *
4994 * 'foo'[-2, 2] # => "oo"
4995 *
4996 * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4997 *
4998 * 'foo'[3, 2] # => ""
4999 *
5000 * Returns +nil+ if +start+ is out of range:
5001 *
5002 * 'foo'[4, 2] # => nil
5003 * 'foo'[-4, 2] # => nil
5004 *
5005 * Returns the trailing substring of +self+ if +length+ is large:
5006 *
5007 * 'foo'[1, 50] # => "oo"
5008 *
5009 * Returns +nil+ if +length+ is negative:
5010 *
5011 * 'foo'[0, -1] # => nil
5012 *
5013 * When the single \Range argument +range+ is given,
5014 * derives +start+ and +length+ values from the given +range+,
5015 * and returns values as above:
5016 *
5017 * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5018 * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5019 *
5020 * When the \Regexp argument +regexp+ is given,
5021 * and the +capture+ argument is <tt>0</tt>,
5022 * returns the first matching substring found in +self+,
5023 * or +nil+ if none found:
5024 *
5025 * 'foo'[/o/] # => "o"
5026 * 'foo'[/x/] # => nil
5027 * s = 'hello there'
5028 * s[/[aeiou](.)\1/] # => "ell"
5029 * s[/[aeiou](.)\1/, 0] # => "ell"
5030 *
5031 * If argument +capture+ is given and not <tt>0</tt>,
5032 * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5033 * the method call returns only the specified capture
5034 * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5035 *
5036 * s = 'hello there'
5037 * s[/[aeiou](.)\1/, 1] # => "l"
5038 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5039 * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5040 *
5041 * If an invalid capture group index is given, +nil+ is returned. If an invalid
5042 * capture group name is given, +IndexError+ is raised.
5043 *
5044 * When the single \String argument +substring+ is given,
5045 * returns the substring from +self+ if found, otherwise +nil+:
5046 *
5047 * 'foo'['oo'] # => "oo"
5048 * 'foo'['xx'] # => nil
5049 *
5050 * String#slice is an alias for String#[].
5051 */
5052
5053static VALUE
5054rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5055{
5056 if (argc == 2) {
5057 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5058 return rb_str_subpat(str, argv[0], argv[1]);
5059 }
5060 else {
5061 long beg = NUM2LONG(argv[0]);
5062 long len = NUM2LONG(argv[1]);
5063 return rb_str_substr(str, beg, len);
5064 }
5065 }
5066 rb_check_arity(argc, 1, 2);
5067 return rb_str_aref(str, argv[0]);
5068}
5069
5070VALUE
5072{
5073 char *ptr = RSTRING_PTR(str);
5074 long olen = RSTRING_LEN(str), nlen;
5075
5076 str_modifiable(str);
5077 if (len > olen) len = olen;
5078 nlen = olen - len;
5079 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5080 char *oldptr = ptr;
5081 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5082 STR_SET_EMBED(str);
5083 STR_SET_EMBED_LEN(str, nlen);
5084 ptr = RSTRING(str)->as.embed.ary;
5085 memmove(ptr, oldptr + len, nlen);
5086 if (fl == STR_NOEMBED) xfree(oldptr);
5087 }
5088 else {
5089 if (!STR_SHARED_P(str)) {
5090 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5091 rb_enc_cr_str_exact_copy(shared, str);
5092 OBJ_FREEZE(shared);
5093 }
5094 ptr = RSTRING(str)->as.heap.ptr += len;
5095 RSTRING(str)->as.heap.len = nlen;
5096 }
5097 ptr[nlen] = 0;
5099 return str;
5100}
5101
5102static void
5103rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5104{
5105 char *sptr;
5106 long slen, vlen = RSTRING_LEN(val);
5107 int cr;
5108
5109 if (beg == 0 && vlen == 0) {
5110 rb_str_drop_bytes(str, len);
5111 return;
5112 }
5113
5114 str_modify_keep_cr(str);
5115 RSTRING_GETMEM(str, sptr, slen);
5116 if (len < vlen) {
5117 /* expand string */
5118 RESIZE_CAPA(str, slen + vlen - len);
5119 sptr = RSTRING_PTR(str);
5120 }
5121
5123 cr = rb_enc_str_coderange(val);
5124 else
5126
5127 if (vlen != len) {
5128 memmove(sptr + beg + vlen,
5129 sptr + beg + len,
5130 slen - (beg + len));
5131 }
5132 if (vlen < beg && len < 0) {
5133 MEMZERO(sptr + slen, char, -len);
5134 }
5135 if (vlen > 0) {
5136 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5137 }
5138 slen += vlen - len;
5139 STR_SET_LEN(str, slen);
5140 TERM_FILL(&sptr[slen], TERM_LEN(str));
5141 ENC_CODERANGE_SET(str, cr);
5142}
5143
5144void
5145rb_str_update(VALUE str, long beg, long len, VALUE val)
5146{
5147 long slen;
5148 char *p, *e;
5149 rb_encoding *enc;
5150 int singlebyte = single_byte_optimizable(str);
5151 int cr;
5152
5153 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5154
5155 StringValue(val);
5156 enc = rb_enc_check(str, val);
5157 slen = str_strlen(str, enc); /* rb_enc_check */
5158
5159 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5160 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5161 }
5162 if (beg < 0) {
5163 beg += slen;
5164 }
5165 assert(beg >= 0);
5166 assert(beg <= slen);
5167 if (len > slen - beg) {
5168 len = slen - beg;
5169 }
5170 str_modify_keep_cr(str);
5171 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5172 if (!p) p = RSTRING_END(str);
5173 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5174 if (!e) e = RSTRING_END(str);
5175 /* error check */
5176 beg = p - RSTRING_PTR(str); /* physical position */
5177 len = e - p; /* physical length */
5178 rb_str_splice_0(str, beg, len, val);
5179 rb_enc_associate(str, enc);
5181 if (cr != ENC_CODERANGE_BROKEN)
5182 ENC_CODERANGE_SET(str, cr);
5183}
5184
/* rb_str_splice() is an alias for rb_str_update(); all four arguments are
 * passed through unchanged. */
#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5186
5187static void
5188rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5189{
5190 int nth;
5191 VALUE match;
5192 long start, end, len;
5193 rb_encoding *enc;
5194 struct re_registers *regs;
5195
5196 if (rb_reg_search(re, str, 0, 0) < 0) {
5197 rb_raise(rb_eIndexError, "regexp not matched");
5198 }
5199 match = rb_backref_get();
5200 nth = rb_reg_backref_number(match, backref);
5201 regs = RMATCH_REGS(match);
5202 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5203 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5204 }
5205 if (nth < 0) {
5206 nth += regs->num_regs;
5207 }
5208
5209 start = BEG(nth);
5210 if (start == -1) {
5211 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5212 }
5213 end = END(nth);
5214 len = end - start;
5215 StringValue(val);
5216 enc = rb_enc_check_str(str, val);
5217 rb_str_splice_0(str, start, len, val);
5218 rb_enc_associate(str, enc);
5219}
5220
5221static VALUE
5222rb_str_aset(VALUE str, VALUE indx, VALUE val)
5223{
5224 long idx, beg;
5225
5226 switch (TYPE(indx)) {
5227 case T_REGEXP:
5228 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5229 return val;
5230
5231 case T_STRING:
5232 beg = rb_str_index(str, indx, 0);
5233 if (beg < 0) {
5234 rb_raise(rb_eIndexError, "string not matched");
5235 }
5236 beg = rb_str_sublen(str, beg);
5237 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5238 return val;
5239
5240 default:
5241 /* check if indx is Range */
5242 {
5243 long beg, len;
5244 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5245 rb_str_splice(str, beg, len, val);
5246 return val;
5247 }
5248 }
5249 /* FALLTHROUGH */
5250
5251 case T_FIXNUM:
5252 idx = NUM2LONG(indx);
5253 rb_str_splice(str, idx, 1, val);
5254 return val;
5255 }
5256}
5257
5258/*
5259 * call-seq:
5260 * str[integer] = new_str
5261 * str[integer, integer] = new_str
5262 * str[range] = aString
5263 * str[regexp] = new_str
5264 * str[regexp, integer] = new_str
5265 * str[regexp, name] = new_str
5266 * str[other_str] = new_str
5267 *
5268 * Element Assignment---Replaces some or all of the content of
5269 * <i>str</i>. The portion of the string affected is determined using
5270 * the same criteria as String#[]. If the replacement string is not
5271 * the same length as the text it is replacing, the string will be
 * adjusted accordingly. If the regular expression or string used
 * as the index doesn't match a position in the string, IndexError is
 * raised. If the regular expression form is used, the optional
 * second Integer allows you to specify which portion of the match to
 * replace (effectively using the MatchData indexing rules). The forms
5277 * that take an Integer will raise an IndexError if the value is out
5278 * of range; the Range form will raise a RangeError, and the Regexp
5279 * and String will raise an IndexError on negative match.
5280 */
5281
5282static VALUE
5283rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5284{
5285 if (argc == 3) {
5286 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5287 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5288 }
5289 else {
5290 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5291 }
5292 return argv[2];
5293 }
5294 rb_check_arity(argc, 2, 3);
5295 return rb_str_aset(str, argv[0], argv[1]);
5296}
5297
5298/*
5299 * call-seq:
5300 * insert(index, other_string) -> self
5301 *
5302 * Inserts the given +other_string+ into +self+; returns +self+.
5303 *
 * If the \Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
5305 *
5306 * 'foo'.insert(1, 'bar') # => "fbaroo"
5307 *
5308 * If the \Integer +index+ is negative, counts backward from the end of +self+
5309 * and inserts +other_string+ at offset <tt>index+1</tt>
5310 * (that is, _after_ <tt>self[index]</tt>):
5311 *
5312 * 'foo'.insert(-2, 'bar') # => "fobaro"
5313 *
5314 */
5315
5316static VALUE
5317rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5318{
5319 long pos = NUM2LONG(idx);
5320
5321 if (pos == -1) {
5322 return rb_str_append(str, str2);
5323 }
5324 else if (pos < 0) {
5325 pos++;
5326 }
5327 rb_str_splice(str, pos, 0, str2);
5328 return str;
5329}
5330
5331
5332/*
5333 * call-seq:
5334 * slice!(index) -> new_string or nil
5335 * slice!(start, length) -> new_string or nil
5336 * slice!(range) -> new_string or nil
5337 * slice!(regexp, capture = 0) -> new_string or nil
5338 * slice!(substring) -> new_string or nil
5339 *
5340 * Removes the substring of +self+ specified by the arguments;
5341 * returns the removed substring.
5342 *
5343 * See String#[] for details about the arguments that specify the substring.
5344 *
5345 * A few examples:
5346 *
5347 * string = "This is a string"
5348 * string.slice!(2) #=> "i"
5349 * string.slice!(3..6) #=> " is "
5350 * string.slice!(/s.*t/) #=> "sa st"
5351 * string.slice!("r") #=> "r"
5352 * string #=> "Thing"
5353 *
5354 */
5355
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;  /* len stays 1 for the single-index forms */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];

    /*
     * Each branch below either returns nil or jumps to one of three labels:
     *   num_index - beg/len still have to be resolved by rb_str_subpos();
     *   subseq    - beg/len are offsets into RSTRING_PTR(str) and the
     *               result string still has to be built;
     *   squash    - result is ready; only the removal from str remains.
     */
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp [, capture]) */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* negative capture number: count back from the register count */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): one character; nil when nothing is addressed */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument itself */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* slice!(range), or an object convertible to an integer index */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: same handling as the Fixnum branch above */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* rb_str_subpos() may adjust len; beg becomes an offset from the
     * start of the buffer */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* removal at the head is delegated to rb_str_drop_bytes() */
            rb_str_drop_bytes(str, len);
        }
        else {
            /* close the gap by moving the tail down, then re-terminate */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
5442
5443static VALUE
5444get_pat(VALUE pat)
5445{
5446 VALUE val;
5447
5448 switch (OBJ_BUILTIN_TYPE(pat)) {
5449 case T_REGEXP:
5450 return pat;
5451
5452 case T_STRING:
5453 break;
5454
5455 default:
5456 val = rb_check_string_type(pat);
5457 if (NIL_P(val)) {
5458 Check_Type(pat, T_REGEXP);
5459 }
5460 pat = val;
5461 }
5462
5463 return rb_reg_regcomp(pat);
5464}
5465
5466static VALUE
5467get_pat_quoted(VALUE pat, int check)
5468{
5469 VALUE val;
5470
5471 switch (OBJ_BUILTIN_TYPE(pat)) {
5472 case T_REGEXP:
5473 return pat;
5474
5475 case T_STRING:
5476 break;
5477
5478 default:
5479 val = rb_check_string_type(pat);
5480 if (NIL_P(val)) {
5481 Check_Type(pat, T_REGEXP);
5482 }
5483 pat = val;
5484 }
5485 if (check && is_broken_string(pat)) {
5486 rb_exc_raise(rb_reg_check_preprocess(pat));
5487 }
5488 return pat;
5489}
5490
5491static long
5492rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5493{
5494 if (BUILTIN_TYPE(pat) == T_STRING) {
5495 pos = rb_strseq_index(str, pat, pos, 1);
5496 if (set_backref_str) {
5497 if (pos >= 0) {
5498 str = rb_str_new_frozen_String(str);
5499 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5500 }
5501 else {
5503 }
5504 }
5505 return pos;
5506 }
5507 else {
5508 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5509 }
5510}
5511
5512
5513/*
5514 * call-seq:
5515 * sub!(pattern, replacement) -> self or nil
5516 * sub!(pattern) {|match| ... } -> self or nil
5517 *
5518 * Returns +self+ with only the first occurrence
5519 * (not all occurrences) of the given +pattern+ replaced.
5520 *
5521 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5522 *
5523 * Related: String#sub, String#gsub, String#gsub!.
5524 *
5525 */
5526
5527static VALUE
5528rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5529{
5530 VALUE pat, repl, hash = Qnil;
5531 int iter = 0;
5532 long plen;
5533 int min_arity = rb_block_given_p() ? 1 : 2;
5534 long beg;
5535
5536 rb_check_arity(argc, min_arity, 2);
5537 if (argc == 1) {
5538 iter = 1;
5539 }
5540 else {
5541 repl = argv[1];
5542 hash = rb_check_hash_type(argv[1]);
5543 if (NIL_P(hash)) {
5544 StringValue(repl);
5545 }
5546 }
5547
5548 pat = get_pat_quoted(argv[0], 1);
5549
5550 str_modifiable(str);
5551 beg = rb_pat_search(pat, str, 0, 1);
5552 if (beg >= 0) {
5553 rb_encoding *enc;
5554 int cr = ENC_CODERANGE(str);
5555 long beg0, end0;
5556 VALUE match, match0 = Qnil;
5557 struct re_registers *regs;
5558 char *p, *rp;
5559 long len, rlen;
5560
5561 match = rb_backref_get();
5562 regs = RMATCH_REGS(match);
5563 if (RB_TYPE_P(pat, T_STRING)) {
5564 beg0 = beg;
5565 end0 = beg0 + RSTRING_LEN(pat);
5566 match0 = pat;
5567 }
5568 else {
5569 beg0 = BEG(0);
5570 end0 = END(0);
5571 if (iter) match0 = rb_reg_nth_match(0, match);
5572 }
5573
5574 if (iter || !NIL_P(hash)) {
5575 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5576
5577 if (iter) {
5578 repl = rb_obj_as_string(rb_yield(match0));
5579 }
5580 else {
5581 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5582 repl = rb_obj_as_string(repl);
5583 }
5584 str_mod_check(str, p, len);
5585 rb_check_frozen(str);
5586 }
5587 else {
5588 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5589 }
5590
5591 enc = rb_enc_compatible(str, repl);
5592 if (!enc) {
5593 rb_encoding *str_enc = STR_ENC_GET(str);
5594 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5595 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5596 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5597 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5598 rb_enc_name(str_enc),
5599 rb_enc_name(STR_ENC_GET(repl)));
5600 }
5601 enc = STR_ENC_GET(repl);
5602 }
5603 rb_str_modify(str);
5604 rb_enc_associate(str, enc);
5606 int cr2 = ENC_CODERANGE(repl);
5607 if (cr2 == ENC_CODERANGE_BROKEN ||
5608 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5610 else
5611 cr = cr2;
5612 }
5613 plen = end0 - beg0;
5614 rlen = RSTRING_LEN(repl);
5615 len = RSTRING_LEN(str);
5616 if (rlen > plen) {
5617 RESIZE_CAPA(str, len + rlen - plen);
5618 }
5619 p = RSTRING_PTR(str);
5620 if (rlen != plen) {
5621 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5622 }
5623 rp = RSTRING_PTR(repl);
5624 memmove(p + beg0, rp, rlen);
5625 len += rlen - plen;
5626 STR_SET_LEN(str, len);
5627 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5628 ENC_CODERANGE_SET(str, cr);
5629
5630 return str;
5631 }
5632 return Qnil;
5633}
5634
5635
5636/*
5637 * call-seq:
5638 * sub(pattern, replacement) -> new_string
5639 * sub(pattern) {|match| ... } -> new_string
5640 *
5641 * Returns a copy of +self+ with only the first occurrence
5642 * (not all occurrences) of the given +pattern+ replaced.
5643 *
5644 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5645 *
5646 * Related: String#sub!, String#gsub, String#gsub!.
5647 *
5648 */
5649
5650static VALUE
5651rb_str_sub(int argc, VALUE *argv, VALUE str)
5652{
5653 str = str_duplicate(rb_cString, str);
5654 rb_str_sub_bang(argc, argv, str);
5655 return str;
5656}
5657
5658static VALUE
5659str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5660{
5661 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5662 struct re_registers *regs;
5663 long beg, beg0, end0;
5664 long offset, blen, slen, len, last;
5665 enum {STR, ITER, MAP} mode = STR;
5666 char *sp, *cp;
5667 int need_backref = -1;
5668 rb_encoding *str_enc;
5669
5670 switch (argc) {
5671 case 1:
5672 RETURN_ENUMERATOR(str, argc, argv);
5673 mode = ITER;
5674 break;
5675 case 2:
5676 repl = argv[1];
5677 hash = rb_check_hash_type(argv[1]);
5678 if (NIL_P(hash)) {
5679 StringValue(repl);
5680 }
5681 else {
5682 mode = MAP;
5683 }
5684 break;
5685 default:
5686 rb_error_arity(argc, 1, 2);
5687 }
5688
5689 pat = get_pat_quoted(argv[0], 1);
5690 beg = rb_pat_search(pat, str, 0, need_backref);
5691 if (beg < 0) {
5692 if (bang) return Qnil; /* no match, no substitution */
5693 return str_duplicate(rb_cString, str);
5694 }
5695
5696 offset = 0;
5697 blen = RSTRING_LEN(str) + 30; /* len + margin */
5698 dest = rb_str_buf_new(blen);
5699 sp = RSTRING_PTR(str);
5700 slen = RSTRING_LEN(str);
5701 cp = sp;
5702 str_enc = STR_ENC_GET(str);
5703 rb_enc_associate(dest, str_enc);
5705
5706 do {
5707 match = rb_backref_get();
5708 regs = RMATCH_REGS(match);
5709 if (RB_TYPE_P(pat, T_STRING)) {
5710 beg0 = beg;
5711 end0 = beg0 + RSTRING_LEN(pat);
5712 match0 = pat;
5713 }
5714 else {
5715 beg0 = BEG(0);
5716 end0 = END(0);
5717 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5718 }
5719
5720 if (mode) {
5721 if (mode == ITER) {
5722 val = rb_obj_as_string(rb_yield(match0));
5723 }
5724 else {
5725 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5726 val = rb_obj_as_string(val);
5727 }
5728 str_mod_check(str, sp, slen);
5729 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5730 rb_raise(rb_eRuntimeError, "block should not cheat");
5731 }
5732 }
5733 else if (need_backref) {
5734 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5735 if (need_backref < 0) {
5736 need_backref = val != repl;
5737 }
5738 }
5739 else {
5740 val = repl;
5741 }
5742
5743 len = beg0 - offset; /* copy pre-match substr */
5744 if (len) {
5745 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5746 }
5747
5748 rb_str_buf_append(dest, val);
5749
5750 last = offset;
5751 offset = end0;
5752 if (beg0 == end0) {
5753 /*
5754 * Always consume at least one character of the input string
5755 * in order to prevent infinite loops.
5756 */
5757 if (RSTRING_LEN(str) <= end0) break;
5758 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5759 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5760 offset = end0 + len;
5761 }
5762 cp = RSTRING_PTR(str) + offset;
5763 if (offset > RSTRING_LEN(str)) break;
5764 beg = rb_pat_search(pat, str, offset, need_backref);
5765 } while (beg >= 0);
5766 if (RSTRING_LEN(str) > offset) {
5767 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5768 }
5769 rb_pat_search(pat, str, last, 1);
5770 if (bang) {
5771 str_shared_replace(str, dest);
5772 }
5773 else {
5774 str = dest;
5775 }
5776
5777 return str;
5778}
5779
5780
5781/*
5782 * call-seq:
5783 * gsub!(pattern, replacement) -> self or nil
5784 * gsub!(pattern) {|match| ... } -> self or nil
5785 * gsub!(pattern) -> an_enumerator
5786 *
5787 * Performs the specified substring replacement(s) on +self+;
5788 * returns +self+ if any replacement occurred, +nil+ otherwise.
5789 *
5790 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5791 *
5792 * Returns an Enumerator if no +replacement+ and no block given.
5793 *
5794 * Related: String#sub, String#gsub, String#sub!.
5795 *
5796 */
5797
5798static VALUE
5799rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5800{
5801 str_modify_keep_cr(str);
5802 return str_gsub(argc, argv, str, 1);
5803}
5804
5805
5806/*
5807 * call-seq:
5808 * gsub(pattern, replacement) -> new_string
5809 * gsub(pattern) {|match| ... } -> new_string
5810 * gsub(pattern) -> enumerator
5811 *
5812 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5813 *
5814 * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5815 *
5816 * Returns an Enumerator if no +replacement+ and no block given.
5817 *
5818 * Related: String#sub, String#sub!, String#gsub!.
5819 *
5820 */
5821
5822static VALUE
5823rb_str_gsub(int argc, VALUE *argv, VALUE str)
5824{
5825 return str_gsub(argc, argv, str, 0);
5826}
5827
5828
5829/*
5830 * call-seq:
5831 * replace(other_string) -> self
5832 *
5833 * Replaces the contents of +self+ with the contents of +other_string+:
5834 *
5835 * s = 'foo' # => "foo"
5836 * s.replace('bar') # => "bar"
5837 *
5838 */
5839
5840VALUE
5842{
5843 str_modifiable(str);
5844 if (str == str2) return str;
5845
5846 StringValue(str2);
5847 str_discard(str);
5848 return str_replace(str, str2);
5849}
5850
5851/*
5852 * call-seq:
5853 * clear -> self
5854 *
5855 * Removes the contents of +self+:
5856 *
5857 * s = 'foo' # => "foo"
5858 * s.clear # => ""
5859 *
5860 */
5861
5862static VALUE
5863rb_str_clear(VALUE str)
5864{
5865 str_discard(str);
5866 STR_SET_EMBED(str);
5867 STR_SET_EMBED_LEN(str, 0);
5868 RSTRING_PTR(str)[0] = 0;
5869 if (rb_enc_asciicompat(STR_ENC_GET(str)))
5871 else
5873 return str;
5874}
5875
5876/*
5877 * call-seq:
5878 * chr -> string
5879 *
5880 * Returns a string containing the first character of +self+:
5881 *
5882 * s = 'foo' # => "foo"
5883 * s.chr # => "f"
5884 *
5885 */
5886
5887static VALUE
5888rb_str_chr(VALUE str)
5889{
5890 return rb_str_substr(str, 0, 1);
5891}
5892
5893/*
5894 * call-seq:
5895 * getbyte(index) -> integer
5896 *
5897 * Returns the byte at zero-based +index+ as an integer:
5898 *
5899 * s = 'abcde' # => "abcde"
5900 * s.getbyte(0) # => 97
5901 * s.getbyte(1) # => 98
5902 *
5903 * Related: String#setbyte.
5904 */
5905static VALUE
5906rb_str_getbyte(VALUE str, VALUE index)
5907{
5908 long pos = NUM2LONG(index);
5909
5910 if (pos < 0)
5911 pos += RSTRING_LEN(str);
5912 if (pos < 0 || RSTRING_LEN(str) <= pos)
5913 return Qnil;
5914
5915 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5916}
5917
5918/*
5919 * call-seq:
5920 * setbyte(index, integer) -> integer
5921 *
5922 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5923 *
5924 * s = 'abcde' # => "abcde"
5925 * s.setbyte(0, 98) # => 98
5926 * s # => "bbcde"
5927 *
5928 * Related: String#getbyte.
5929 */
5930static VALUE
5931rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5932{
5933 long pos = NUM2LONG(index);
5934 long len = RSTRING_LEN(str);
5935 char *ptr, *head, *left = 0;
5936 rb_encoding *enc;
5937 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5938
5939 if (pos < -len || len <= pos)
5940 rb_raise(rb_eIndexError, "index %ld out of string", pos);
5941 if (pos < 0)
5942 pos += len;
5943
5944 VALUE v = rb_to_int(value);
5945 VALUE w = rb_int_and(v, INT2FIX(0xff));
5946 char byte = (char)(NUM2INT(w) & 0xFF);
5947
5948 if (!str_independent(str))
5949 str_make_independent(str);
5950 enc = STR_ENC_GET(str);
5951 head = RSTRING_PTR(str);
5952 ptr = &head[pos];
5953 if (!STR_EMBED_P(str)) {
5954 cr = ENC_CODERANGE(str);
5955 switch (cr) {
5956 case ENC_CODERANGE_7BIT:
5957 left = ptr;
5958 *ptr = byte;
5959 if (ISASCII(byte)) goto end;
5960 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5961 if (!MBCLEN_CHARFOUND_P(nlen))
5963 else
5965 goto end;
5967 left = rb_enc_left_char_head(head, ptr, head+len, enc);
5968 width = rb_enc_precise_mbclen(left, head+len, enc);
5969 *ptr = byte;
5970 nlen = rb_enc_precise_mbclen(left, head+len, enc);
5971 if (!MBCLEN_CHARFOUND_P(nlen))
5973 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5975 goto end;
5976 }
5977 }
5979 *ptr = byte;
5980
5981 end:
5982 return value;
5983}
5984
5985static VALUE
5986str_byte_substr(VALUE str, long beg, long len, int empty)
5987{
5988 char *p, *s = RSTRING_PTR(str);
5989 long n = RSTRING_LEN(str);
5990 VALUE str2;
5991
5992 if (beg > n || len < 0) return Qnil;
5993 if (beg < 0) {
5994 beg += n;
5995 if (beg < 0) return Qnil;
5996 }
5997 if (len > n - beg)
5998 len = n - beg;
5999 if (len <= 0) {
6000 if (!empty) return Qnil;
6001 len = 0;
6002 p = 0;
6003 }
6004 else
6005 p = s + beg;
6006
6007 if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
6008 str2 = rb_str_new_frozen(str);
6009 str2 = str_new_shared(rb_cString, str2);
6010 RSTRING(str2)->as.heap.ptr += beg;
6011 RSTRING(str2)->as.heap.len = len;
6012 }
6013 else {
6014 str2 = rb_str_new(p, len);
6015 }
6016
6017 str_enc_copy(str2, str);
6018
6019 if (RSTRING_LEN(str2) == 0) {
6020 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6022 else
6024 }
6025 else {
6026 switch (ENC_CODERANGE(str)) {
6027 case ENC_CODERANGE_7BIT:
6029 break;
6030 default:
6032 break;
6033 }
6034 }
6035
6036 return str2;
6037}
6038
6039static VALUE
6040str_byte_aref(VALUE str, VALUE indx)
6041{
6042 long idx;
6043 if (FIXNUM_P(indx)) {
6044 idx = FIX2LONG(indx);
6045 }
6046 else {
6047 /* check if indx is Range */
6048 long beg, len = RSTRING_LEN(str);
6049
6050 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6051 case Qfalse:
6052 break;
6053 case Qnil:
6054 return Qnil;
6055 default:
6056 return str_byte_substr(str, beg, len, TRUE);
6057 }
6058
6059 idx = NUM2LONG(indx);
6060 }
6061 return str_byte_substr(str, idx, 1, FALSE);
6062}
6063
6064/*
6065 * call-seq:
6066 * byteslice(index, length = 1) -> string or nil
6067 * byteslice(range) -> string or nil
6068 *
6069 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6070 *
6071 * With integer arguments +index+ and +length+ given,
6072 * returns the substring beginning at the given +index+
6073 * of the given +length+ (if possible),
6074 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6075 *
6076 * s = '0123456789' # => "0123456789"
6077 * s.byteslice(2) # => "2"
6078 * s.byteslice(200) # => nil
6079 * s.byteslice(4, 3) # => "456"
6080 * s.byteslice(4, 30) # => "456789"
6081 * s.byteslice(4, -1) # => nil
6082 * s.byteslice(40, 2) # => nil
6083 *
6084 * In either case above, counts backwards from the end of +self+
6085 * if +index+ is negative:
6086 *
6087 * s = '0123456789' # => "0123456789"
6088 * s.byteslice(-4) # => "6"
6089 * s.byteslice(-4, 3) # => "678"
6090 *
6091 * With Range argument +range+ given, returns
6092 * <tt>byteslice(range.begin, range.size)</tt>:
6093 *
6094 * s = '0123456789' # => "0123456789"
6095 * s.byteslice(4..6) # => "456"
6096 * s.byteslice(-6..-4) # => "456"
6097 * s.byteslice(5..2) # => "" # range.size is zero.
6098 * s.byteslice(40..42) # => nil
6099 *
6100 * In all cases, a returned string has the same encoding as +self+:
6101 *
6102 * s.encoding # => #<Encoding:UTF-8>
6103 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6104 *
6105 */
6106
6107static VALUE
6108rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6109{
6110 if (argc == 2) {
6111 long beg = NUM2LONG(argv[0]);
6112 long end = NUM2LONG(argv[1]);
6113 return str_byte_substr(str, beg, end, TRUE);
6114 }
6115 rb_check_arity(argc, 1, 2);
6116 return str_byte_aref(str, argv[0]);
6117}
6118
6119/*
6120 * call-seq:
6121 * reverse -> string
6122 *
6123 * Returns a new string with the characters from +self+ in reverse order.
6124 *
6125 * 'stressed'.reverse # => "desserts"
6126 *
6127 */
6128
6129static VALUE
6130rb_str_reverse(VALUE str)
6131{
6132 rb_encoding *enc;
6133 VALUE rev;
6134 char *s, *e, *p;
6135 int cr;
6136
6137 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6138 enc = STR_ENC_GET(str);
6139 rev = rb_str_new(0, RSTRING_LEN(str));
6140 s = RSTRING_PTR(str); e = RSTRING_END(str);
6141 p = RSTRING_END(rev);
6142 cr = ENC_CODERANGE(str);
6143
6144 if (RSTRING_LEN(str) > 1) {
6145 if (single_byte_optimizable(str)) {
6146 while (s < e) {
6147 *--p = *s++;
6148 }
6149 }
6150 else if (cr == ENC_CODERANGE_VALID) {
6151 while (s < e) {
6152 int clen = rb_enc_fast_mbclen(s, e, enc);
6153
6154 p -= clen;
6155 memcpy(p, s, clen);
6156 s += clen;
6157 }
6158 }
6159 else {
6160 cr = rb_enc_asciicompat(enc) ?
6162 while (s < e) {
6163 int clen = rb_enc_mbclen(s, e, enc);
6164
6165 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6166 p -= clen;
6167 memcpy(p, s, clen);
6168 s += clen;
6169 }
6170 }
6171 }
6172 STR_SET_LEN(rev, RSTRING_LEN(str));
6173 str_enc_copy(rev, str);
6174 ENC_CODERANGE_SET(rev, cr);
6175
6176 return rev;
6177}
6178
6179
6180/*
6181 * call-seq:
6182 * reverse! -> self
6183 *
6184 * Returns +self+ with its characters reversed:
6185 *
6186 * s = 'stressed'
6187 * s.reverse! # => "desserts"
6188 * s # => "desserts"
6189 *
6190 */
6191
6192static VALUE
6193rb_str_reverse_bang(VALUE str)
6194{
6195 if (RSTRING_LEN(str) > 1) {
6196 if (single_byte_optimizable(str)) {
6197 char *s, *e, c;
6198
6199 str_modify_keep_cr(str);
6200 s = RSTRING_PTR(str);
6201 e = RSTRING_END(str) - 1;
6202 while (s < e) {
6203 c = *s;
6204 *s++ = *e;
6205 *e-- = c;
6206 }
6207 }
6208 else {
6209 str_shared_replace(str, rb_str_reverse(str));
6210 }
6211 }
6212 else {
6213 str_modify_keep_cr(str);
6214 }
6215 return str;
6216}
6217
6218
6219/*
6220 * call-seq:
6221 * include? other_string -> true or false
6222 *
6223 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6224 *
6225 * s = 'foo'
6226 * s.include?('f') # => true
6227 * s.include?('fo') # => true
6228 * s.include?('food') # => false
6229 *
6230 */
6231
6232static VALUE
6233rb_str_include(VALUE str, VALUE arg)
6234{
6235 long i;
6236
6237 StringValue(arg);
6238 i = rb_str_index(str, arg, 0);
6239
6240 return RBOOL(i != -1);
6241}
6242
6243
6244/*
6245 * call-seq:
6246 * to_i(base = 10) -> integer
6247 *
6248 * Returns the result of interpreting leading characters in +self+
6249 * as an integer in the given +base+ (which must be in (2..36)):
6250 *
6251 * '123456'.to_i # => 123456
6252 * '123def'.to_i(16) # => 1195503
6253 *
6254 * Characters past a leading valid number (in the given +base+) are ignored:
6255 *
6256 * '12.345'.to_i # => 12
6257 * '12345'.to_i(2) # => 1
6258 *
6259 * Returns zero if there is no leading valid number:
6260 *
6261 * 'abcdef'.to_i # => 0
6262 * '2'.to_i(2) # => 0
6263 *
6264 */
6265
6266static VALUE
6267rb_str_to_i(int argc, VALUE *argv, VALUE str)
6268{
6269 int base = 10;
6270
6271 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6272 rb_raise(rb_eArgError, "invalid radix %d", base);
6273 }
6274 return rb_str_to_inum(str, base, FALSE);
6275}
6276
6277
6278/*
6279 * call-seq:
6280 * to_f -> float
6281 *
6282 * Returns the result of interpreting leading characters in +self+ as a Float:
6283 *
6284 * '3.14159'.to_f # => 3.14159
6285 '1.234e-2'.to_f # => 0.01234
6286 *
6287 * Characters past a leading valid number (in the given +base+) are ignored:
6288 *
6289 * '3.14 (pi to two places)'.to_f # => 3.14
6290 *
6291 * Returns zero if there is no leading valid number:
6292 *
6293 * 'abcdef'.to_f # => 0.0
6294 *
6295 */
6296
6297static VALUE
6298rb_str_to_f(VALUE str)
6299{
6300 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6301}
6302
6303
6304/*
6305 * call-seq:
6306 * to_s -> self or string
6307 *
6308 * Returns +self+ if +self+ is a \String,
6309 * or +self+ converted to a \String if +self+ is a subclass of \String.
6310 *
6311 * String#to_str is an alias for String#to_s.
6312 *
6313 */
6314
6315static VALUE
6316rb_str_to_s(VALUE str)
6317{
6318 if (rb_obj_class(str) != rb_cString) {
6319 return str_duplicate(rb_cString, str);
6320 }
6321 return str;
6322}
6323
#if 0
/* Encodes code point c in enc and appends it to str (compiled out). */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int len = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, len, enc);
}
#endif
6335
6336#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6337
6338int
6339rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6340{
6341 char buf[CHAR_ESC_LEN + 1];
6342 int l;
6343
6344#if SIZEOF_INT > 4
6345 c &= 0xffffffff;
6346#endif
6347 if (unicode_p) {
6348 if (c < 0x7F && ISPRINT(c)) {
6349 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6350 }
6351 else if (c < 0x10000) {
6352 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6353 }
6354 else {
6355 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6356 }
6357 }
6358 else {
6359 if (c < 0x100) {
6360 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6361 }
6362 else {
6363 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6364 }
6365 }
6366 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6367 rb_str_buf_cat(result, buf, l);
6368 return l;
6369}
6370
/*
 * Returns the backslash notation for the control characters that have
 * one ("\\n", "\\t", ...), or NULL when c has no such notation.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].notation;
    }
    return NULL;
}
6388
6389VALUE
6390rb_str_escape(VALUE str)
6391{
6392 int encidx = ENCODING_GET(str);
6393 rb_encoding *enc = rb_enc_from_index(encidx);
6394 const char *p = RSTRING_PTR(str);
6395 const char *pend = RSTRING_END(str);
6396 const char *prev = p;
6397 char buf[CHAR_ESC_LEN + 1];
6398 VALUE result = rb_str_buf_new(0);
6399 int unicode_p = rb_enc_unicode_p(enc);
6400 int asciicompat = rb_enc_asciicompat(enc);
6401
6402 while (p < pend) {
6403 unsigned int c;
6404 const char *cc;
6405 int n = rb_enc_precise_mbclen(p, pend, enc);
6406 if (!MBCLEN_CHARFOUND_P(n)) {
6407 if (p > prev) str_buf_cat(result, prev, p - prev);
6408 n = rb_enc_mbminlen(enc);
6409 if (pend < p + n)
6410 n = (int)(pend - p);
6411 while (n--) {
6412 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6413 str_buf_cat(result, buf, strlen(buf));
6414 prev = ++p;
6415 }
6416 continue;
6417 }
6418 n = MBCLEN_CHARFOUND_LEN(n);
6419 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6420 p += n;
6421 cc = ruby_escaped_char(c);
6422 if (cc) {
6423 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6424 str_buf_cat(result, cc, strlen(cc));
6425 prev = p;
6426 }
6427 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6428 }
6429 else {
6430 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6431 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6432 prev = p;
6433 }
6434 }
6435 if (p > prev) str_buf_cat(result, prev, p - prev);
6436 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6437
6438 return result;
6439}
6440
6441/*
6442 * call-seq:
6443 * inspect -> string
6444 *
6445 * Returns a printable version of +self+, enclosed in double-quotes,
6446 * and with special characters escaped:
6447 *
6448 * s = "foo\tbar\tbaz\n"
6449 * # => "foo\tbar\tbaz\n"
6450 * s.inspect
6451 * # => "\"foo\\tbar\\tbaz\\n\""
6452 *
6453 */
6454
6455VALUE
6457{
6458 int encidx = ENCODING_GET(str);
6459 rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6460 const char *p, *pend, *prev;
6461 char buf[CHAR_ESC_LEN + 1];
6462 VALUE result = rb_str_buf_new(0);
6463 rb_encoding *resenc = rb_default_internal_encoding();
6464 int unicode_p = rb_enc_unicode_p(enc);
6465 int asciicompat = rb_enc_asciicompat(enc);
6466
6467 if (resenc == NULL) resenc = rb_default_external_encoding();
6468 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6469 rb_enc_associate(result, resenc);
6470 str_buf_cat2(result, "\"");
6471
6472 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6473 prev = p;
6474 actenc = get_actual_encoding(encidx, str);
6475 if (actenc != enc) {
6476 enc = actenc;
6477 if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6478 }
6479 while (p < pend) {
6480 unsigned int c, cc;
6481 int n;
6482
6483 n = rb_enc_precise_mbclen(p, pend, enc);
6484 if (!MBCLEN_CHARFOUND_P(n)) {
6485 if (p > prev) str_buf_cat(result, prev, p - prev);
6486 n = rb_enc_mbminlen(enc);
6487 if (pend < p + n)
6488 n = (int)(pend - p);
6489 while (n--) {
6490 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6491 str_buf_cat(result, buf, strlen(buf));
6492 prev = ++p;
6493 }
6494 continue;
6495 }
6496 n = MBCLEN_CHARFOUND_LEN(n);
6497 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6498 p += n;
6499 if ((asciicompat || unicode_p) &&
6500 (c == '"'|| c == '\\' ||
6501 (c == '#' &&
6502 p < pend &&
6503 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6504 (cc = rb_enc_codepoint(p,pend,enc),
6505 (cc == '$' || cc == '@' || cc == '{'))))) {
6506 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6507 str_buf_cat2(result, "\\");
6508 if (asciicompat || enc == resenc) {
6509 prev = p - n;
6510 continue;
6511 }
6512 }
6513 switch (c) {
6514 case '\n': cc = 'n'; break;
6515 case '\r': cc = 'r'; break;
6516 case '\t': cc = 't'; break;
6517 case '\f': cc = 'f'; break;
6518 case '\013': cc = 'v'; break;
6519 case '\010': cc = 'b'; break;
6520 case '\007': cc = 'a'; break;
6521 case 033: cc = 'e'; break;
6522 default: cc = 0; break;
6523 }
6524 if (cc) {
6525 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6526 buf[0] = '\\';
6527 buf[1] = (char)cc;
6528 str_buf_cat(result, buf, 2);
6529 prev = p;
6530 continue;
6531 }
6532 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6533 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6534 continue;
6535 }
6536 else {
6537 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6538 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6539 prev = p;
6540 continue;
6541 }
6542 }
6543 if (p > prev) str_buf_cat(result, prev, p - prev);
6544 str_buf_cat2(result, "\"");
6545
6546 return result;
6547}
6548
6549#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6550
6551/*
6552 * call-seq:
6553 * dump -> string
6554 *
6555 * Returns a printable version of +self+, enclosed in double-quotes,
6556 * with special characters escaped, and with non-printing characters
6557 * replaced by hexadecimal notation:
6558 *
6559 * "hello \n ''".dump # => "\"hello \\n ''\""
6560 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6561 *
6562 * Related: String#undump (inverse of String#dump).
6563 *
6564 */
6565
6566VALUE
6568{
6569 int encidx = rb_enc_get_index(str);
6570 rb_encoding *enc = rb_enc_from_index(encidx);
6571 long len;
6572 const char *p, *pend;
6573 char *q, *qend;
6574 VALUE result;
6575 int u8 = (encidx == rb_utf8_encindex());
6576 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6577
6578 len = 2; /* "" */
6579 if (!rb_enc_asciicompat(enc)) {
6580 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6581 len += strlen(enc->name);
6582 }
6583
6584 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6585 while (p < pend) {
6586 int clen;
6587 unsigned char c = *p++;
6588
6589 switch (c) {
6590 case '"': case '\\':
6591 case '\n': case '\r':
6592 case '\t': case '\f':
6593 case '\013': case '\010': case '\007': case '\033':
6594 clen = 2;
6595 break;
6596
6597 case '#':
6598 clen = IS_EVSTR(p, pend) ? 2 : 1;
6599 break;
6600
6601 default:
6602 if (ISPRINT(c)) {
6603 clen = 1;
6604 }
6605 else {
6606 if (u8 && c > 0x7F) { /* \u notation */
6607 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6608 if (MBCLEN_CHARFOUND_P(n)) {
6609 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6610 if (cc <= 0xFFFF)
6611 clen = 6; /* \uXXXX */
6612 else if (cc <= 0xFFFFF)
6613 clen = 9; /* \u{XXXXX} */
6614 else
6615 clen = 10; /* \u{XXXXXX} */
6616 p += MBCLEN_CHARFOUND_LEN(n)-1;
6617 break;
6618 }
6619 }
6620 clen = 4; /* \xNN */
6621 }
6622 break;
6623 }
6624
6625 if (clen > LONG_MAX - len) {
6626 rb_raise(rb_eRuntimeError, "string size too big");
6627 }
6628 len += clen;
6629 }
6630
6631 result = rb_str_new(0, len);
6632 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6633 q = RSTRING_PTR(result); qend = q + len + 1;
6634
6635 *q++ = '"';
6636 while (p < pend) {
6637 unsigned char c = *p++;
6638
6639 if (c == '"' || c == '\\') {
6640 *q++ = '\\';
6641 *q++ = c;
6642 }
6643 else if (c == '#') {
6644 if (IS_EVSTR(p, pend)) *q++ = '\\';
6645 *q++ = '#';
6646 }
6647 else if (c == '\n') {
6648 *q++ = '\\';
6649 *q++ = 'n';
6650 }
6651 else if (c == '\r') {
6652 *q++ = '\\';
6653 *q++ = 'r';
6654 }
6655 else if (c == '\t') {
6656 *q++ = '\\';
6657 *q++ = 't';
6658 }
6659 else if (c == '\f') {
6660 *q++ = '\\';
6661 *q++ = 'f';
6662 }
6663 else if (c == '\013') {
6664 *q++ = '\\';
6665 *q++ = 'v';
6666 }
6667 else if (c == '\010') {
6668 *q++ = '\\';
6669 *q++ = 'b';
6670 }
6671 else if (c == '\007') {
6672 *q++ = '\\';
6673 *q++ = 'a';
6674 }
6675 else if (c == '\033') {
6676 *q++ = '\\';
6677 *q++ = 'e';
6678 }
6679 else if (ISPRINT(c)) {
6680 *q++ = c;
6681 }
6682 else {
6683 *q++ = '\\';
6684 if (u8) {
6685 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6686 if (MBCLEN_CHARFOUND_P(n)) {
6687 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6688 p += n;
6689 if (cc <= 0xFFFF)
6690 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6691 else
6692 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6693 q += strlen(q);
6694 continue;
6695 }
6696 }
6697 snprintf(q, qend-q, "x%02X", c);
6698 q += 3;
6699 }
6700 }
6701 *q++ = '"';
6702 *q = '\0';
6703 if (!rb_enc_asciicompat(enc)) {
6704 snprintf(q, qend-q, nonascii_suffix, enc->name);
6705 encidx = rb_ascii8bit_encindex();
6706 }
6707 /* result from dump is ASCII */
6708 rb_enc_associate_index(result, encidx);
6710 return result;
6711}
6712
/*
 * Maps the letter of a one-letter backslash escape (n, r, t, f, v, b,
 * a, e) to the control character it stands for.
 *
 * Returns -1 for any other value, so the function never falls off its
 * end without a return value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
6736
6737static void
6738undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6739{
6740 const char *s = *ss;
6741 unsigned int c;
6742 int codelen;
6743 size_t hexlen;
6744 unsigned char buf[6];
6745 static rb_encoding *enc_utf8 = NULL;
6746
6747 switch (*s) {
6748 case '\\':
6749 case '"':
6750 case '#':
6751 rb_str_cat(undumped, s, 1); /* cat itself */
6752 s++;
6753 break;
6754 case 'n':
6755 case 'r':
6756 case 't':
6757 case 'f':
6758 case 'v':
6759 case 'b':
6760 case 'a':
6761 case 'e':
6762 *buf = unescape_ascii(*s);
6763 rb_str_cat(undumped, (char *)buf, 1);
6764 s++;
6765 break;
6766 case 'u':
6767 if (*binary) {
6768 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6769 }
6770 *utf8 = true;
6771 if (++s >= s_end) {
6772 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6773 }
6774 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6775 if (*penc != enc_utf8) {
6776 *penc = enc_utf8;
6777 rb_enc_associate(undumped, enc_utf8);
6778 }
6779 if (*s == '{') { /* handle \u{...} form */
6780 s++;
6781 for (;;) {
6782 if (s >= s_end) {
6783 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6784 }
6785 if (*s == '}') {
6786 s++;
6787 break;
6788 }
6789 if (ISSPACE(*s)) {
6790 s++;
6791 continue;
6792 }
6793 c = scan_hex(s, s_end-s, &hexlen);
6794 if (hexlen == 0 || hexlen > 6) {
6795 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6796 }
6797 if (c > 0x10ffff) {
6798 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6799 }
6800 if (0xd800 <= c && c <= 0xdfff) {
6801 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6802 }
6803 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6804 rb_str_cat(undumped, (char *)buf, codelen);
6805 s += hexlen;
6806 }
6807 }
6808 else { /* handle \uXXXX form */
6809 c = scan_hex(s, 4, &hexlen);
6810 if (hexlen != 4) {
6811 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6812 }
6813 if (0xd800 <= c && c <= 0xdfff) {
6814 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6815 }
6816 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6817 rb_str_cat(undumped, (char *)buf, codelen);
6818 s += hexlen;
6819 }
6820 break;
6821 case 'x':
6822 if (*utf8) {
6823 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6824 }
6825 *binary = true;
6826 if (++s >= s_end) {
6827 rb_raise(rb_eRuntimeError, "invalid hex escape");
6828 }
6829 *buf = scan_hex(s, 2, &hexlen);
6830 if (hexlen != 2) {
6831 rb_raise(rb_eRuntimeError, "invalid hex escape");
6832 }
6833 rb_str_cat(undumped, (char *)buf, 1);
6834 s += hexlen;
6835 break;
6836 default:
6837 rb_str_cat(undumped, s-1, 2);
6838 s++;
6839 }
6840
6841 *ss = s;
6842}
6843
6844static VALUE rb_str_is_ascii_only_p(VALUE str);
6845
6846/*
6847 * call-seq:
6848 * undump -> string
6849 *
6850 * Returns an unescaped version of +self+:
6851 *
6852 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6853 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6854 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6855 * s_undumped == s_orig # => true
6856 *
6857 * Related: String#dump (inverse of String#undump).
6858 *
6859 */
6860
6861static VALUE
6862str_undump(VALUE str)
6863{
6864 const char *s = RSTRING_PTR(str);
6865 const char *s_end = RSTRING_END(str);
6866 rb_encoding *enc = rb_enc_get(str);
6867 VALUE undumped = rb_enc_str_new(s, 0L, enc);
6868 bool utf8 = false;
6869 bool binary = false;
6870 int w;
6871
6873 if (rb_str_is_ascii_only_p(str) == Qfalse) {
6874 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6875 }
6876 if (!str_null_check(str, &w)) {
6877 rb_raise(rb_eRuntimeError, "string contains null byte");
6878 }
6879 if (RSTRING_LEN(str) < 2) goto invalid_format;
6880 if (*s != '"') goto invalid_format;
6881
6882 /* strip '"' at the start */
6883 s++;
6884
6885 for (;;) {
6886 if (s >= s_end) {
6887 rb_raise(rb_eRuntimeError, "unterminated dumped string");
6888 }
6889
6890 if (*s == '"') {
6891 /* epilogue */
6892 s++;
6893 if (s == s_end) {
6894 /* ascii compatible dumped string */
6895 break;
6896 }
6897 else {
6898 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6899 static const char dup_suffix[] = ".dup";
6900 const char *encname;
6901 int encidx;
6902 ptrdiff_t size;
6903
6904 /* check separately for strings dumped by older versions */
6905 size = sizeof(dup_suffix) - 1;
6906 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6907
6908 size = sizeof(force_encoding_suffix) - 1;
6909 if (s_end - s <= size) goto invalid_format;
6910 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6911 s += size;
6912
6913 if (utf8) {
6914 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6915 }
6916
6917 encname = s;
6918 s = memchr(s, '"', s_end-s);
6919 size = s - encname;
6920 if (!s) goto invalid_format;
6921 if (s_end - s != 2) goto invalid_format;
6922 if (s[0] != '"' || s[1] != ')') goto invalid_format;
6923
6924 encidx = rb_enc_find_index2(encname, (long)size);
6925 if (encidx < 0) {
6926 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6927 }
6928 rb_enc_associate_index(undumped, encidx);
6929 }
6930 break;
6931 }
6932
6933 if (*s == '\\') {
6934 s++;
6935 if (s >= s_end) {
6936 rb_raise(rb_eRuntimeError, "invalid escape");
6937 }
6938 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6939 }
6940 else {
6941 rb_str_cat(undumped, s++, 1);
6942 }
6943 }
6944
6945 return undumped;
6946invalid_format:
6947 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6948}
6949
6950static void
6951rb_str_check_dummy_enc(rb_encoding *enc)
6952{
6953 if (rb_enc_dummy_p(enc)) {
6954 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6955 rb_enc_name(enc));
6956 }
6957}
6958
6959static rb_encoding *
6960str_true_enc(VALUE str)
6961{
6962 rb_encoding *enc = STR_ENC_GET(str);
6963 rb_str_check_dummy_enc(enc);
6964 return enc;
6965}
6966
6967static OnigCaseFoldType
6968check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6969{
6970 if (argc==0)
6971 return flags;
6972 if (argc>2)
6973 rb_raise(rb_eArgError, "too many options");
6974 if (argv[0]==sym_turkic) {
6975 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6976 if (argc==2) {
6977 if (argv[1]==sym_lithuanian)
6978 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6979 else
6980 rb_raise(rb_eArgError, "invalid second option");
6981 }
6982 }
6983 else if (argv[0]==sym_lithuanian) {
6984 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6985 if (argc==2) {
6986 if (argv[1]==sym_turkic)
6987 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6988 else
6989 rb_raise(rb_eArgError, "invalid second option");
6990 }
6991 }
6992 else if (argc>1)
6993 rb_raise(rb_eArgError, "too many options");
6994 else if (argv[0]==sym_ascii)
6995 flags |= ONIGENC_CASE_ASCII_ONLY;
6996 else if (argv[0]==sym_fold) {
6997 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6998 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6999 else
7000 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7001 }
7002 else
7003 rb_raise(rb_eArgError, "invalid option");
7004 return flags;
7005}
7006
7007static inline bool
7008case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7009{
7010 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7011 return true;
7012 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7013}
7014
7015/* 16 should be long enough to absorb any kind of single character length increase */
7016#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7017#ifndef CASEMAP_DEBUG
7018# define CASEMAP_DEBUG 0
7019#endif
7020
7021struct mapping_buffer;
7022typedef struct mapping_buffer {
7023 size_t capa;
7024 size_t used;
7025 struct mapping_buffer *next;
7026 OnigUChar space[FLEX_ARY_LEN];
7028
7029static void
7030mapping_buffer_free(void *p)
7031{
7032 mapping_buffer *previous_buffer;
7033 mapping_buffer *current_buffer = p;
7034 while (current_buffer) {
7035 previous_buffer = current_buffer;
7036 current_buffer = current_buffer->next;
7037 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7038 }
7039}
7040
7041static const rb_data_type_t mapping_buffer_type = {
7042 "mapping_buffer",
7043 {0, mapping_buffer_free,}
7044};
7045
7046static VALUE
7047rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7048{
7049 VALUE target;
7050
7051 const OnigUChar *source_current, *source_end;
7052 int target_length = 0;
7053 VALUE buffer_anchor;
7054 mapping_buffer *current_buffer = 0;
7055 mapping_buffer **pre_buffer;
7056 size_t buffer_count = 0;
7057 int buffer_length_or_invalid;
7058
7059 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7060
7061 source_current = (OnigUChar*)RSTRING_PTR(source);
7062 source_end = (OnigUChar*)RSTRING_END(source);
7063
7064 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7065 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7066 while (source_current < source_end) {
7067 /* increase multiplier using buffer count to converge quickly */
7068 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7069 if (CASEMAP_DEBUG) {
7070 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7071 }
7072 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7073 *pre_buffer = current_buffer;
7074 pre_buffer = &current_buffer->next;
7075 current_buffer->next = NULL;
7076 current_buffer->capa = capa;
7077 buffer_length_or_invalid = enc->case_map(flags,
7078 &source_current, source_end,
7079 current_buffer->space,
7080 current_buffer->space+current_buffer->capa,
7081 enc);
7082 if (buffer_length_or_invalid < 0) {
7083 current_buffer = DATA_PTR(buffer_anchor);
7084 DATA_PTR(buffer_anchor) = 0;
7085 mapping_buffer_free(current_buffer);
7086 rb_raise(rb_eArgError, "input string invalid");
7087 }
7088 target_length += current_buffer->used = buffer_length_or_invalid;
7089 }
7090 if (CASEMAP_DEBUG) {
7091 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7092 }
7093
7094 if (buffer_count==1) {
7095 target = rb_str_new((const char*)current_buffer->space, target_length);
7096 }
7097 else {
7098 char *target_current;
7099
7100 target = rb_str_new(0, target_length);
7101 target_current = RSTRING_PTR(target);
7102 current_buffer = DATA_PTR(buffer_anchor);
7103 while (current_buffer) {
7104 memcpy(target_current, current_buffer->space, current_buffer->used);
7105 target_current += current_buffer->used;
7106 current_buffer = current_buffer->next;
7107 }
7108 }
7109 current_buffer = DATA_PTR(buffer_anchor);
7110 DATA_PTR(buffer_anchor) = 0;
7111 mapping_buffer_free(current_buffer);
7112
7113 RB_GC_GUARD(buffer_anchor);
7114
7115 /* TODO: check about string terminator character */
7116 str_enc_copy(target, source);
7117 /*ENC_CODERANGE_SET(mapped, cr);*/
7118
7119 return target;
7120}
7121
7122static VALUE
7123rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7124{
7125 const OnigUChar *source_current, *source_end;
7126 OnigUChar *target_current, *target_end;
7127 long old_length = RSTRING_LEN(source);
7128 int length_or_invalid;
7129
7130 if (old_length == 0) return Qnil;
7131
7132 source_current = (OnigUChar*)RSTRING_PTR(source);
7133 source_end = (OnigUChar*)RSTRING_END(source);
7134 if (source == target) {
7135 target_current = (OnigUChar*)source_current;
7136 target_end = (OnigUChar*)source_end;
7137 }
7138 else {
7139 target_current = (OnigUChar*)RSTRING_PTR(target);
7140 target_end = (OnigUChar*)RSTRING_END(target);
7141 }
7142
7143 length_or_invalid = onigenc_ascii_only_case_map(flags,
7144 &source_current, source_end,
7145 target_current, target_end, enc);
7146 if (length_or_invalid < 0)
7147 rb_raise(rb_eArgError, "input string invalid");
7148 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7149 fprintf(stderr, "problem with rb_str_ascii_casemap"
7150 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7151 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7152 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7153 }
7154
7155 str_enc_copy(target, source);
7156
7157 return target;
7158}
7159
7160static bool
7161upcase_single(VALUE str)
7162{
7163 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7164 bool modified = false;
7165
7166 while (s < send) {
7167 unsigned int c = *(unsigned char*)s;
7168
7169 if ('a' <= c && c <= 'z') {
7170 *s = 'A' + (c - 'a');
7171 modified = true;
7172 }
7173 s++;
7174 }
7175 return modified;
7176}
7177
7178/*
7179 * call-seq:
7180 * upcase!(*options) -> self or nil
7181 *
7182 * Upcases the characters in +self+;
7183 * returns +self+ if any changes were made, +nil+ otherwise:
7184 *
7185 * s = 'Hello World!' # => "Hello World!"
7186 * s.upcase! # => "HELLO WORLD!"
7187 * s # => "HELLO WORLD!"
7188 * s.upcase! # => nil
7189 *
7190 * The casing may be affected by the given +options+;
7191 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7192 *
7193 * Related: String#upcase, String#downcase, String#downcase!.
7194 *
7195 */
7196
7197static VALUE
7198rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7199{
7200 rb_encoding *enc;
7201 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7202
7203 flags = check_case_options(argc, argv, flags);
7204 str_modify_keep_cr(str);
7205 enc = str_true_enc(str);
7206 if (case_option_single_p(flags, enc, str)) {
7207 if (upcase_single(str))
7208 flags |= ONIGENC_CASE_MODIFIED;
7209 }
7210 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7211 rb_str_ascii_casemap(str, str, &flags, enc);
7212 else
7213 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7214
7215 if (ONIGENC_CASE_MODIFIED&flags) return str;
7216 return Qnil;
7217}
7218
7219
7220/*
7221 * call-seq:
7222 * upcase(*options) -> string
7223 *
7224 * Returns a string containing the upcased characters in +self+:
7225 *
7226 * s = 'Hello World!' # => "Hello World!"
7227 * s.upcase # => "HELLO WORLD!"
7228 *
7229 * The casing may be affected by the given +options+;
7230 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7231 *
7232 * Related: String#upcase!, String#downcase, String#downcase!.
7233 *
7234 */
7235
7236static VALUE
7237rb_str_upcase(int argc, VALUE *argv, VALUE str)
7238{
7239 rb_encoding *enc;
7240 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7241 VALUE ret;
7242
7243 flags = check_case_options(argc, argv, flags);
7244 enc = str_true_enc(str);
7245 if (case_option_single_p(flags, enc, str)) {
7246 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7247 str_enc_copy(ret, str);
7248 upcase_single(ret);
7249 }
7250 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7251 ret = rb_str_new(0, RSTRING_LEN(str));
7252 rb_str_ascii_casemap(str, ret, &flags, enc);
7253 }
7254 else {
7255 ret = rb_str_casemap(str, &flags, enc);
7256 }
7257
7258 return ret;
7259}
7260
7261static bool
7262downcase_single(VALUE str)
7263{
7264 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7265 bool modified = false;
7266
7267 while (s < send) {
7268 unsigned int c = *(unsigned char*)s;
7269
7270 if ('A' <= c && c <= 'Z') {
7271 *s = 'a' + (c - 'A');
7272 modified = true;
7273 }
7274 s++;
7275 }
7276
7277 return modified;
7278}
7279
7280/*
7281 * call-seq:
7282 * downcase!(*options) -> self or nil
7283 *
7284 * Downcases the characters in +self+;
7285 * returns +self+ if any changes were made, +nil+ otherwise:
7286 *
7287 * s = 'Hello World!' # => "Hello World!"
7288 * s.downcase! # => "hello world!"
7289 * s # => "hello world!"
7290 * s.downcase! # => nil
7291 *
7292 * The casing may be affected by the given +options+;
7293 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7294 *
7295 * Related: String#downcase, String#upcase, String#upcase!.
7296 *
7297 */
7298
7299static VALUE
7300rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7301{
7302 rb_encoding *enc;
7303 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7304
7305 flags = check_case_options(argc, argv, flags);
7306 str_modify_keep_cr(str);
7307 enc = str_true_enc(str);
7308 if (case_option_single_p(flags, enc, str)) {
7309 if (downcase_single(str))
7310 flags |= ONIGENC_CASE_MODIFIED;
7311 }
7312 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7313 rb_str_ascii_casemap(str, str, &flags, enc);
7314 else
7315 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7316
7317 if (ONIGENC_CASE_MODIFIED&flags) return str;
7318 return Qnil;
7319}
7320
7321
7322/*
7323 * call-seq:
7324 * downcase(*options) -> string
7325 *
7326 * Returns a string containing the downcased characters in +self+:
7327 *
7328 * s = 'Hello World!' # => "Hello World!"
7329 * s.downcase # => "hello world!"
7330 *
7331 * The casing may be affected by the given +options+;
7332 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7333 *
7334 * Related: String#downcase!, String#upcase, String#upcase!.
7335 *
7336 */
7337
7338static VALUE
7339rb_str_downcase(int argc, VALUE *argv, VALUE str)
7340{
7341 rb_encoding *enc;
7342 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7343 VALUE ret;
7344
7345 flags = check_case_options(argc, argv, flags);
7346 enc = str_true_enc(str);
7347 if (case_option_single_p(flags, enc, str)) {
7348 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7349 str_enc_copy(ret, str);
7350 downcase_single(ret);
7351 }
7352 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7353 ret = rb_str_new(0, RSTRING_LEN(str));
7354 rb_str_ascii_casemap(str, ret, &flags, enc);
7355 }
7356 else {
7357 ret = rb_str_casemap(str, &flags, enc);
7358 }
7359
7360 return ret;
7361}
7362
7363
7364/*
7365 * call-seq:
7366 * capitalize!(*options) -> self or nil
7367 *
7368 * Upcases the first character in +self+;
7369 * downcases the remaining characters;
7370 * returns +self+ if any changes were made, +nil+ otherwise:
7371 *
7372 * s = 'hello World!' # => "hello World!"
7373 * s.capitalize! # => "Hello world!"
7374 * s # => "Hello world!"
7375 * s.capitalize! # => nil
7376 *
7377 * The casing may be affected by the given +options+;
7378 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7379 *
7380 * Related: String#capitalize.
7381 *
7382 */
7383
7384static VALUE
7385rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7386{
7387 rb_encoding *enc;
7388 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7389
7390 flags = check_case_options(argc, argv, flags);
7391 str_modify_keep_cr(str);
7392 enc = str_true_enc(str);
7393 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7394 if (flags&ONIGENC_CASE_ASCII_ONLY)
7395 rb_str_ascii_casemap(str, str, &flags, enc);
7396 else
7397 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7398
7399 if (ONIGENC_CASE_MODIFIED&flags) return str;
7400 return Qnil;
7401}
7402
7403
7404/*
7405 * call-seq:
7406 * capitalize(*options) -> string
7407 *
7408 * Returns a string containing the characters in +self+;
7409 * the first character is upcased;
7410 * the remaining characters are downcased:
7411 *
7412 * s = 'hello World!' # => "hello World!"
7413 * s.capitalize # => "Hello world!"
7414 *
7415 * The casing may be affected by the given +options+;
7416 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7417 *
7418 * Related: String#capitalize!.
7419 *
7420 */
7421
7422static VALUE
7423rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7424{
7425 rb_encoding *enc;
7426 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7427 VALUE ret;
7428
7429 flags = check_case_options(argc, argv, flags);
7430 enc = str_true_enc(str);
7431 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7432 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7433 ret = rb_str_new(0, RSTRING_LEN(str));
7434 rb_str_ascii_casemap(str, ret, &flags, enc);
7435 }
7436 else {
7437 ret = rb_str_casemap(str, &flags, enc);
7438 }
7439 return ret;
7440}
7441
7442
7443/*
7444 * call-seq:
7445 * swapcase!(*options) -> self or nil
7446 *
7447 * Upcases each lowercase character in +self+;
7448 * downcases each uppercase character;
7449 * returns +self+ if any changes were made, +nil+ otherwise:
7450 *
7451 * s = 'Hello World!' # => "Hello World!"
7452 * s.swapcase! # => "hELLO wORLD!"
7453 * s # => "hELLO wORLD!"
7454 * ''.swapcase! # => nil
7455 *
7456 * The casing may be affected by the given +options+;
7457 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7458 *
7459 * Related: String#swapcase.
7460 *
7461 */
7462
7463static VALUE
7464rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7465{
7466 rb_encoding *enc;
7467 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7468
7469 flags = check_case_options(argc, argv, flags);
7470 str_modify_keep_cr(str);
7471 enc = str_true_enc(str);
7472 if (flags&ONIGENC_CASE_ASCII_ONLY)
7473 rb_str_ascii_casemap(str, str, &flags, enc);
7474 else
7475 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7476
7477 if (ONIGENC_CASE_MODIFIED&flags) return str;
7478 return Qnil;
7479}
7480
7481
7482/*
7483 * call-seq:
7484 * swapcase(*options) -> string
7485 *
7486 * Returns a string containing the characters in +self+, with cases reversed;
7487 * each uppercase character is downcased;
7488 * each lowercase character is upcased:
7489 *
7490 * s = 'Hello World!' # => "Hello World!"
7491 * s.swapcase # => "hELLO wORLD!"
7492 *
7493 * The casing may be affected by the given +options+;
7494 * see {Case Mapping}[doc/case_mapping_rdoc.html].
7495 *
7496 * Related: String#swapcase!.
7497 *
7498 */
7499
7500static VALUE
7501rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7502{
7503 rb_encoding *enc;
7504 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7505 VALUE ret;
7506
7507 flags = check_case_options(argc, argv, flags);
7508 enc = str_true_enc(str);
7509 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7510 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7511 ret = rb_str_new(0, RSTRING_LEN(str));
7512 rb_str_ascii_casemap(str, ret, &flags, enc);
7513 }
7514 else {
7515 ret = rb_str_casemap(str, &flags, enc);
7516 }
7517 return ret;
7518}
7519
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* upper bound of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
7527
7528static unsigned int
7529trnext(struct tr *t, rb_encoding *enc)
7530{
7531 int n;
7532
7533 for (;;) {
7534 nextpart:
7535 if (!t->gen) {
7536 if (t->p == t->pend) return -1;
7537 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7538 t->p += n;
7539 }
7540 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7541 t->p += n;
7542 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7543 t->p += n;
7544 if (t->p < t->pend) {
7545 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7546 t->p += n;
7547 if (t->now > c) {
7548 if (t->now < 0x80 && c < 0x80) {
7550 "invalid range \"%c-%c\" in string transliteration",
7551 t->now, c);
7552 }
7553 else {
7554 rb_raise(rb_eArgError, "invalid range in string transliteration");
7555 }
7556 continue; /* not reached */
7557 }
7558 t->gen = 1;
7559 t->max = c;
7560 }
7561 }
7562 return t->now;
7563 }
7564 else {
7565 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7566 if (t->now == t->max) {
7567 t->gen = 0;
7568 goto nextpart;
7569 }
7570 }
7571 if (t->now < t->max) {
7572 return t->now;
7573 }
7574 else {
7575 t->gen = 0;
7576 return t->max;
7577 }
7578 }
7579 }
7580}
7581
7582static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7583
7584static VALUE
7585tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7586{
7587 const unsigned int errc = -1;
7588 unsigned int trans[256];
7589 rb_encoding *enc, *e1, *e2;
7590 struct tr trsrc, trrepl;
7591 int cflag = 0;
7592 unsigned int c, c0, last = 0;
7593 int modify = 0, i, l;
7594 unsigned char *s, *send;
7595 VALUE hash = 0;
7596 int singlebyte = single_byte_optimizable(str);
7597 int termlen;
7598 int cr;
7599
7600#define CHECK_IF_ASCII(c) \
7601 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7602 (cr = ENC_CODERANGE_VALID) : 0)
7603
7604 StringValue(src);
7605 StringValue(repl);
7606 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7607 if (RSTRING_LEN(repl) == 0) {
7608 return rb_str_delete_bang(1, &src, str);
7609 }
7610
7611 cr = ENC_CODERANGE(str);
7612 e1 = rb_enc_check(str, src);
7613 e2 = rb_enc_check(str, repl);
7614 if (e1 == e2) {
7615 enc = e1;
7616 }
7617 else {
7618 enc = rb_enc_check(src, repl);
7619 }
7620 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7621 if (RSTRING_LEN(src) > 1 &&
7622 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7623 trsrc.p + l < trsrc.pend) {
7624 cflag = 1;
7625 trsrc.p += l;
7626 }
7627 trrepl.p = RSTRING_PTR(repl);
7628 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7629 trsrc.gen = trrepl.gen = 0;
7630 trsrc.now = trrepl.now = 0;
7631 trsrc.max = trrepl.max = 0;
7632
7633 if (cflag) {
7634 for (i=0; i<256; i++) {
7635 trans[i] = 1;
7636 }
7637 while ((c = trnext(&trsrc, enc)) != errc) {
7638 if (c < 256) {
7639 trans[c] = errc;
7640 }
7641 else {
7642 if (!hash) hash = rb_hash_new();
7643 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7644 }
7645 }
7646 while ((c = trnext(&trrepl, enc)) != errc)
7647 /* retrieve last replacer */;
7648 last = trrepl.now;
7649 for (i=0; i<256; i++) {
7650 if (trans[i] != errc) {
7651 trans[i] = last;
7652 }
7653 }
7654 }
7655 else {
7656 unsigned int r;
7657
7658 for (i=0; i<256; i++) {
7659 trans[i] = errc;
7660 }
7661 while ((c = trnext(&trsrc, enc)) != errc) {
7662 r = trnext(&trrepl, enc);
7663 if (r == errc) r = trrepl.now;
7664 if (c < 256) {
7665 trans[c] = r;
7666 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7667 }
7668 else {
7669 if (!hash) hash = rb_hash_new();
7670 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7671 }
7672 }
7673 }
7674
7675 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7676 cr = ENC_CODERANGE_7BIT;
7677 str_modify_keep_cr(str);
7678 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7679 termlen = rb_enc_mbminlen(enc);
7680 if (sflag) {
7681 int clen, tlen;
7682 long offset, max = RSTRING_LEN(str);
7683 unsigned int save = -1;
7684 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7685
7686 while (s < send) {
7687 int may_modify = 0;
7688
7689 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7690 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7691
7692 s += clen;
7693 if (c < 256) {
7694 c = trans[c];
7695 }
7696 else if (hash) {
7697 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7698 if (NIL_P(tmp)) {
7699 if (cflag) c = last;
7700 else c = errc;
7701 }
7702 else if (cflag) c = errc;
7703 else c = NUM2INT(tmp);
7704 }
7705 else {
7706 c = errc;
7707 }
7708 if (c != (unsigned int)-1) {
7709 if (save == c) {
7710 CHECK_IF_ASCII(c);
7711 continue;
7712 }
7713 save = c;
7714 tlen = rb_enc_codelen(c, enc);
7715 modify = 1;
7716 }
7717 else {
7718 save = -1;
7719 c = c0;
7720 if (enc != e1) may_modify = 1;
7721 }
7722 if ((offset = t - buf) + tlen > max) {
7723 size_t MAYBE_UNUSED(old) = max + termlen;
7724 max = offset + tlen + (send - s);
7725 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7726 t = buf + offset;
7727 }
7728 rb_enc_mbcput(c, t, enc);
7729 if (may_modify && memcmp(s, t, tlen) != 0) {
7730 modify = 1;
7731 }
7732 CHECK_IF_ASCII(c);
7733 t += tlen;
7734 }
7735 if (!STR_EMBED_P(str)) {
7736 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7737 }
7738 TERM_FILL((char *)t, termlen);
7739 RSTRING(str)->as.heap.ptr = (char *)buf;
7740 RSTRING(str)->as.heap.len = t - buf;
7741 STR_SET_NOEMBED(str);
7742 RSTRING(str)->as.heap.aux.capa = max;
7743 }
7744 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7745 while (s < send) {
7746 c = (unsigned char)*s;
7747 if (trans[c] != errc) {
7748 if (!cflag) {
7749 c = trans[c];
7750 *s = c;
7751 modify = 1;
7752 }
7753 else {
7754 *s = last;
7755 modify = 1;
7756 }
7757 }
7758 CHECK_IF_ASCII(c);
7759 s++;
7760 }
7761 }
7762 else {
7763 int clen, tlen;
7764 long offset, max = (long)((send - s) * 1.2);
7765 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7766
7767 while (s < send) {
7768 int may_modify = 0;
7769 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7770 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7771
7772 if (c < 256) {
7773 c = trans[c];
7774 }
7775 else if (hash) {
7776 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7777 if (NIL_P(tmp)) {
7778 if (cflag) c = last;
7779 else c = errc;
7780 }
7781 else if (cflag) c = errc;
7782 else c = NUM2INT(tmp);
7783 }
7784 else {
7785 c = cflag ? last : errc;
7786 }
7787 if (c != errc) {
7788 tlen = rb_enc_codelen(c, enc);
7789 modify = 1;
7790 }
7791 else {
7792 c = c0;
7793 if (enc != e1) may_modify = 1;
7794 }
7795 if ((offset = t - buf) + tlen > max) {
7796 size_t MAYBE_UNUSED(old) = max + termlen;
7797 max = offset + tlen + (long)((send - s) * 1.2);
7798 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7799 t = buf + offset;
7800 }
7801 if (s != t) {
7802 rb_enc_mbcput(c, t, enc);
7803 if (may_modify && memcmp(s, t, tlen) != 0) {
7804 modify = 1;
7805 }
7806 }
7807 CHECK_IF_ASCII(c);
7808 s += clen;
7809 t += tlen;
7810 }
7811 if (!STR_EMBED_P(str)) {
7812 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7813 }
7814 TERM_FILL((char *)t, termlen);
7815 RSTRING(str)->as.heap.ptr = (char *)buf;
7816 RSTRING(str)->as.heap.len = t - buf;
7817 STR_SET_NOEMBED(str);
7818 RSTRING(str)->as.heap.aux.capa = max;
7819 }
7820
7821 if (modify) {
7822 if (cr != ENC_CODERANGE_BROKEN)
7823 ENC_CODERANGE_SET(str, cr);
7824 rb_enc_associate(str, enc);
7825 return str;
7826 }
7827 return Qnil;
7828}
7829
7830
7831/*
7832 * call-seq:
7833 * str.tr!(from_str, to_str) -> str or nil
7834 *
7835 * Translates <i>str</i> in place, using the same rules as
7836 * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7837 * were made.
7838 */
7839
7840static VALUE
7841rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7842{
7843 return tr_trans(str, src, repl, 0);
7844}
7845
7846
7847/*
7848 * call-seq:
7849 * str.tr(from_str, to_str) -> new_str
7850 *
7851 * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7852 * corresponding characters in +to_str+. If +to_str+ is shorter than
7853 * +from_str+, it is padded with its last character in order to maintain the
7854 * correspondence.
7855 *
7856 * "hello".tr('el', 'ip') #=> "hippo"
7857 * "hello".tr('aeiou', '*') #=> "h*ll*"
7858 * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7859 *
7860 * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7861 * characters, and +from_str+ may start with a <code>^</code>, which denotes
7862 * all characters except those listed.
7863 *
7864 * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7865 * "hello".tr('^aeiou', '*') #=> "*e**o"
7866 *
7867 * The backslash character <code>\</code> can be used to escape
7868 * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7869 * appears at the end of a range or the end of the +from_str+ or +to_str+:
7870 *
7871 * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7872 * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7873 *
7874 * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7875 * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7876 * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7877 *
7878 * "X['\\b']".tr("X\\", "") #=> "['b']"
7879 * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7880 */
7881
7882static VALUE
7883rb_str_tr(VALUE str, VALUE src, VALUE repl)
7884{
7885 str = str_duplicate(rb_cString, str);
7886 tr_trans(str, src, repl, 0);
7887 return str;
7888}
7889
7890#define TR_TABLE_MAX (UCHAR_MAX+1)
7891#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7892static void
7893tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7894 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7895{
7896 const unsigned int errc = -1;
7897 char buf[TR_TABLE_MAX];
7898 struct tr tr;
7899 unsigned int c;
7900 VALUE table = 0, ptable = 0;
7901 int i, l, cflag = 0;
7902
7903 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7904 tr.gen = tr.now = tr.max = 0;
7905
7906 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7907 cflag = 1;
7908 tr.p += l;
7909 }
7910 if (first) {
7911 for (i=0; i<TR_TABLE_MAX; i++) {
7912 stable[i] = 1;
7913 }
7914 stable[TR_TABLE_MAX] = cflag;
7915 }
7916 else if (stable[TR_TABLE_MAX] && !cflag) {
7917 stable[TR_TABLE_MAX] = 0;
7918 }
7919 for (i=0; i<TR_TABLE_MAX; i++) {
7920 buf[i] = cflag;
7921 }
7922
7923 while ((c = trnext(&tr, enc)) != errc) {
7924 if (c < TR_TABLE_MAX) {
7925 buf[(unsigned char)c] = !cflag;
7926 }
7927 else {
7928 VALUE key = UINT2NUM(c);
7929
7930 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7931 if (cflag) {
7932 ptable = *ctablep;
7933 table = ptable ? ptable : rb_hash_new();
7934 *ctablep = table;
7935 }
7936 else {
7937 table = rb_hash_new();
7938 ptable = *tablep;
7939 *tablep = table;
7940 }
7941 }
7942 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7943 rb_hash_aset(table, key, Qtrue);
7944 }
7945 }
7946 }
7947 for (i=0; i<TR_TABLE_MAX; i++) {
7948 stable[i] = stable[i] && buf[i];
7949 }
7950 if (!table && !cflag) {
7951 *tablep = 0;
7952 }
7953}
7954
7955
7956static int
7957tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7958{
7959 if (c < TR_TABLE_MAX) {
7960 return table[c] != 0;
7961 }
7962 else {
7963 VALUE v = UINT2NUM(c);
7964
7965 if (del) {
7966 if (!NIL_P(rb_hash_lookup(del, v)) &&
7967 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7968 return TRUE;
7969 }
7970 }
7971 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7972 return FALSE;
7973 }
7974 return table[TR_TABLE_MAX] ? TRUE : FALSE;
7975 }
7976}
7977
7978/*
7979 * call-seq:
7980 * str.delete!([other_str]+) -> str or nil
7981 *
7982 * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7983 * <code>nil</code> if <i>str</i> was not modified.
7984 */
7985
7986static VALUE
7987rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7988{
7989 char squeez[TR_TABLE_SIZE];
7990 rb_encoding *enc = 0;
7991 char *s, *send, *t;
7992 VALUE del = 0, nodel = 0;
7993 int modify = 0;
7994 int i, ascompat, cr;
7995
7996 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7998 for (i=0; i<argc; i++) {
7999 VALUE s = argv[i];
8000
8001 StringValue(s);
8002 enc = rb_enc_check(str, s);
8003 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8004 }
8005
8006 str_modify_keep_cr(str);
8007 ascompat = rb_enc_asciicompat(enc);
8008 s = t = RSTRING_PTR(str);
8009 send = RSTRING_END(str);
8010 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8011 while (s < send) {
8012 unsigned int c;
8013 int clen;
8014
8015 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8016 if (squeez[c]) {
8017 modify = 1;
8018 }
8019 else {
8020 if (t != s) *t = c;
8021 t++;
8022 }
8023 s++;
8024 }
8025 else {
8026 c = rb_enc_codepoint_len(s, send, &clen, enc);
8027
8028 if (tr_find(c, squeez, del, nodel)) {
8029 modify = 1;
8030 }
8031 else {
8032 if (t != s) rb_enc_mbcput(c, t, enc);
8033 t += clen;
8035 }
8036 s += clen;
8037 }
8038 }
8039 TERM_FILL(t, TERM_LEN(str));
8040 STR_SET_LEN(str, t - RSTRING_PTR(str));
8041 ENC_CODERANGE_SET(str, cr);
8042
8043 if (modify) return str;
8044 return Qnil;
8045}
8046
8047
8048/*
8049 * call-seq:
8050 * str.delete([other_str]+) -> new_str
8051 *
8052 * Returns a copy of <i>str</i> with all characters in the intersection of its
8053 * arguments deleted. Uses the same rules for building the set of characters as
8054 * String#count.
8055 *
8056 * "hello".delete "l","lo" #=> "heo"
8057 * "hello".delete "lo" #=> "he"
8058 * "hello".delete "aeiou", "^e" #=> "hell"
8059 * "hello".delete "ej-m" #=> "ho"
8060 */
8061
8062static VALUE
8063rb_str_delete(int argc, VALUE *argv, VALUE str)
8064{
8065 str = str_duplicate(rb_cString, str);
8066 rb_str_delete_bang(argc, argv, str);
8067 return str;
8068}
8069
8070
8071/*
8072 * call-seq:
8073 * str.squeeze!([other_str]*) -> str or nil
8074 *
8075 * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8076 * <code>nil</code> if no changes were made.
8077 */
8078
8079static VALUE
8080rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8081{
8082 char squeez[TR_TABLE_SIZE];
8083 rb_encoding *enc = 0;
8084 VALUE del = 0, nodel = 0;
8085 unsigned char *s, *send, *t;
8086 int i, modify = 0;
8087 int ascompat, singlebyte = single_byte_optimizable(str);
8088 unsigned int save;
8089
8090 if (argc == 0) {
8091 enc = STR_ENC_GET(str);
8092 }
8093 else {
8094 for (i=0; i<argc; i++) {
8095 VALUE s = argv[i];
8096
8097 StringValue(s);
8098 enc = rb_enc_check(str, s);
8099 if (singlebyte && !single_byte_optimizable(s))
8100 singlebyte = 0;
8101 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8102 }
8103 }
8104
8105 str_modify_keep_cr(str);
8106 s = t = (unsigned char *)RSTRING_PTR(str);
8107 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8108 send = (unsigned char *)RSTRING_END(str);
8109 save = -1;
8110 ascompat = rb_enc_asciicompat(enc);
8111
8112 if (singlebyte) {
8113 while (s < send) {
8114 unsigned int c = *s++;
8115 if (c != save || (argc > 0 && !squeez[c])) {
8116 *t++ = save = c;
8117 }
8118 }
8119 }
8120 else {
8121 while (s < send) {
8122 unsigned int c;
8123 int clen;
8124
8125 if (ascompat && (c = *s) < 0x80) {
8126 if (c != save || (argc > 0 && !squeez[c])) {
8127 *t++ = save = c;
8128 }
8129 s++;
8130 }
8131 else {
8132 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8133
8134 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8135 if (t != s) rb_enc_mbcput(c, t, enc);
8136 save = c;
8137 t += clen;
8138 }
8139 s += clen;
8140 }
8141 }
8142 }
8143
8144 TERM_FILL((char *)t, TERM_LEN(str));
8145 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8146 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8147 modify = 1;
8148 }
8149
8150 if (modify) return str;
8151 return Qnil;
8152}
8153
8154
8155/*
8156 * call-seq:
8157 * str.squeeze([other_str]*) -> new_str
8158 *
8159 * Builds a set of characters from the <i>other_str</i> parameter(s)
8160 * using the procedure described for String#count. Returns a new
8161 * string where runs of the same character that occur in this set are
8162 * replaced by a single character. If no arguments are given, all
8163 * runs of identical characters are replaced by a single character.
8164 *
8165 * "yellow moon".squeeze #=> "yelow mon"
8166 * "  now   is  the".squeeze(" ") #=> " now is the"
8167 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8168 */
8169
8170static VALUE
8171rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8172{
8173 str = str_duplicate(rb_cString, str);
8174 rb_str_squeeze_bang(argc, argv, str);
8175 return str;
8176}
8177
8178
8179/*
8180 * call-seq:
8181 * str.tr_s!(from_str, to_str) -> str or nil
8182 *
8183 * Performs String#tr_s processing on <i>str</i> in place,
8184 * returning <i>str</i>, or <code>nil</code> if no changes were made.
8185 */
8186
8187static VALUE
8188rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8189{
8190 return tr_trans(str, src, repl, 1);
8191}
8192
8193
8194/*
8195 * call-seq:
8196 * str.tr_s(from_str, to_str) -> new_str
8197 *
8198 * Processes a copy of <i>str</i> as described under String#tr, then
8199 * removes duplicate characters in regions that were affected by the
8200 * translation.
8201 *
8202 * "hello".tr_s('l', 'r') #=> "hero"
8203 * "hello".tr_s('el', '*') #=> "h*o"
8204 * "hello".tr_s('el', 'hx') #=> "hhxo"
8205 */
8206
8207static VALUE
8208rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8209{
8210 str = str_duplicate(rb_cString, str);
8211 tr_trans(str, src, repl, 1);
8212 return str;
8213}
8214
8215
8216/*
8217 * call-seq:
8218 * str.count([other_str]+) -> integer
8219 *
8220 * Each +other_str+ parameter defines a set of characters to count. The
8221 * intersection of these sets defines the characters to count in +str+. Any
8222 * +other_str+ that starts with a caret <code>^</code> is negated. The
8223 * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8224 * backslash character <code>\</code> can be used to escape <code>^</code> or
8225 * <code>-</code> and is otherwise ignored unless it appears at the end of a
8226 * sequence or the end of a +other_str+.
8227 *
8228 * a = "hello world"
8229 * a.count "lo" #=> 5
8230 * a.count "lo", "o" #=> 2
8231 * a.count "hello", "^l" #=> 4
8232 * a.count "ej-m" #=> 4
8233 *
8234 * "hello^world".count "\\^aeiou" #=> 4
8235 * "hello-world".count "a\\-eo" #=> 4
8236 *
8237 * c = "hello world\\r\\n"
8238 * c.count "\\" #=> 2
8239 * c.count "\\A" #=> 0
8240 * c.count "X-\\w" #=> 3
8241 */
8242
8243static VALUE
8244rb_str_count(int argc, VALUE *argv, VALUE str)
8245{
8246 char table[TR_TABLE_SIZE];
8247 rb_encoding *enc = 0;
8248 VALUE del = 0, nodel = 0, tstr;
8249 char *s, *send;
8250 int i;
8251 int ascompat;
8252 size_t n = 0;
8253
8255
8256 tstr = argv[0];
8257 StringValue(tstr);
8258 enc = rb_enc_check(str, tstr);
8259 if (argc == 1) {
8260 const char *ptstr;
8261 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8262 (ptstr = RSTRING_PTR(tstr),
8263 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8264 !is_broken_string(str)) {
8265 int clen;
8266 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8267
8268 s = RSTRING_PTR(str);
8269 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8270 send = RSTRING_END(str);
8271 while (s < send) {
8272 if (*(unsigned char*)s++ == c) n++;
8273 }
8274 return SIZET2NUM(n);
8275 }
8276 }
8277
8278 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8279 for (i=1; i<argc; i++) {
8280 tstr = argv[i];
8281 StringValue(tstr);
8282 enc = rb_enc_check(str, tstr);
8283 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8284 }
8285
8286 s = RSTRING_PTR(str);
8287 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8288 send = RSTRING_END(str);
8289 ascompat = rb_enc_asciicompat(enc);
8290 while (s < send) {
8291 unsigned int c;
8292
8293 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8294 if (table[c]) {
8295 n++;
8296 }
8297 s++;
8298 }
8299 else {
8300 int clen;
8301 c = rb_enc_codepoint_len(s, send, &clen, enc);
8302 if (tr_find(c, table, del, nodel)) {
8303 n++;
8304 }
8305 s += clen;
8306 }
8307 }
8308
8309 return SIZET2NUM(n);
8310}
8311
8312static VALUE
8313rb_fs_check(VALUE val)
8314{
8315 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8316 val = rb_check_string_type(val);
8317 if (NIL_P(val)) return 0;
8318 }
8319 return val;
8320}
8321
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes,
 * 0 (by default initialization) for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1      /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8342
8343static long
8344split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8345{
8346 if (empty_count >= 0 && len == 0) {
8347 return empty_count + 1;
8348 }
8349 if (empty_count > 0) {
8350 /* make different substrings */
8351 if (result) {
8352 do {
8353 rb_ary_push(result, str_new_empty_String(str));
8354 } while (--empty_count > 0);
8355 }
8356 else {
8357 do {
8358 rb_yield(str_new_empty_String(str));
8359 } while (--empty_count > 0);
8360 }
8361 }
8362 str = rb_str_subseq(str, beg, len);
8363 if (result) {
8364 rb_ary_push(result, str);
8365 }
8366 else {
8367 rb_yield(str);
8368 }
8369 return empty_count;
8370}
8371
/* Splitting strategies selected by rb_str_split_m. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string delimiter */
    SPLIT_TYPE_REGEXP,  /* split where a Regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
8375
8376static split_type_t
8377literal_split_pattern(VALUE spat, split_type_t default_type)
8378{
8379 rb_encoding *enc = STR_ENC_GET(spat);
8380 const char *ptr;
8381 long len;
8382 RSTRING_GETMEM(spat, ptr, len);
8383 if (len == 0) {
8384 /* Special case - split into chars */
8385 return SPLIT_TYPE_CHARS;
8386 }
8387 else if (rb_enc_asciicompat(enc)) {
8388 if (len == 1 && ptr[0] == ' ') {
8389 return SPLIT_TYPE_AWK;
8390 }
8391 }
8392 else {
8393 int l;
8394 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8395 return SPLIT_TYPE_AWK;
8396 }
8397 }
8398 return default_type;
8399}
8400
8401/*
8402 * call-seq:
8403 * str.split(pattern=nil, [limit]) -> an_array
8404 * str.split(pattern=nil, [limit]) {|sub| block } -> str
8405 *
8406 * Divides <i>str</i> into substrings based on a delimiter, returning an array
8407 * of these substrings.
8408 *
8409 * If <i>pattern</i> is a String, then its contents are used as
8410 * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8411 * space, <i>str</i> is split on whitespace, with leading and trailing
8412 * whitespace and runs of contiguous whitespace characters ignored.
8413 *
8414 * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8415 * pattern matches. Whenever the pattern matches a zero-length string,
8416 * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8417 * groups, the respective matches will be returned in the array as well.
8418 *
8419 * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8420 * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8421 * split on whitespace as if ' ' were specified.
8422 *
8423 * If the <i>limit</i> parameter is omitted, trailing null fields are
8424 * suppressed. If <i>limit</i> is a positive number, at most that number
8425 * of split substrings will be returned (captured groups will be returned
8426 * as well, but are not counted towards the limit).
8427 * If <i>limit</i> is <code>1</code>, the entire
8428 * string is returned as the only entry in an array. If negative, there is no
8429 * limit to the number of fields returned, and trailing null fields are not
8430 * suppressed.
8431 *
8432 * When the input +str+ is empty an empty Array is returned as the string is
8433 * considered to have no fields to split.
8434 *
8435 * " now's the time ".split #=> ["now's", "the", "time"]
8436 * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8437 * " now's  the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8438 * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8439 * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8440 * "hello".split(//, 3) #=> ["h", "e", "llo"]
8441 * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8442 *
8443 * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8444 * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8445 * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8446 * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8447 *
8448 * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8449 *
8450 * "".split(',', -1) #=> []
8451 *
8452 * If a block is given, invoke the block with each split substring.
8453 *
8454 */
8455
8456static VALUE
8457rb_str_split_m(int argc, VALUE *argv, VALUE str)
8458{
8459 rb_encoding *enc;
8460 VALUE spat;
8461 VALUE limit;
8462 split_type_t split_type;
8463 long beg, end, i = 0, empty_count = -1;
8464 int lim = 0;
8465 VALUE result, tmp;
8466
8467 result = rb_block_given_p() ? Qfalse : Qnil;
8468 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8469 lim = NUM2INT(limit);
8470 if (lim <= 0) limit = Qnil;
8471 else if (lim == 1) {
8472 if (RSTRING_LEN(str) == 0)
8473 return result ? rb_ary_new2(0) : str;
8474 tmp = str_duplicate(rb_cString, str);
8475 if (!result) {
8476 rb_yield(tmp);
8477 return str;
8478 }
8479 return rb_ary_new3(1, tmp);
8480 }
8481 i = 1;
8482 }
8483 if (NIL_P(limit) && !lim) empty_count = 0;
8484
8485 enc = STR_ENC_GET(str);
8486 split_type = SPLIT_TYPE_REGEXP;
8487 if (!NIL_P(spat)) {
8488 spat = get_pat_quoted(spat, 0);
8489 }
8490 else if (NIL_P(spat = rb_fs)) {
8491 split_type = SPLIT_TYPE_AWK;
8492 }
8493 else if (!(spat = rb_fs_check(spat))) {
8494 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8495 }
8496 else {
8497 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8498 }
8499 if (split_type != SPLIT_TYPE_AWK) {
8500 switch (BUILTIN_TYPE(spat)) {
8501 case T_REGEXP:
8502 rb_reg_options(spat); /* check if uninitialized */
8503 tmp = RREGEXP_SRC(spat);
8504 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8505 if (split_type == SPLIT_TYPE_AWK) {
8506 spat = tmp;
8507 split_type = SPLIT_TYPE_STRING;
8508 }
8509 break;
8510
8511 case T_STRING:
8512 mustnot_broken(spat);
8513 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8514 break;
8515
8516 default:
8518 }
8519 }
8520
8521#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8522
8523 if (result) result = rb_ary_new();
8524 beg = 0;
8525 char *ptr = RSTRING_PTR(str);
8526 char *eptr = RSTRING_END(str);
8527 if (split_type == SPLIT_TYPE_AWK) {
8528 char *bptr = ptr;
8529 int skip = 1;
8530 unsigned int c;
8531
8532 end = beg;
8533 if (is_ascii_string(str)) {
8534 while (ptr < eptr) {
8535 c = (unsigned char)*ptr++;
8536 if (skip) {
8537 if (ascii_isspace(c)) {
8538 beg = ptr - bptr;
8539 }
8540 else {
8541 end = ptr - bptr;
8542 skip = 0;
8543 if (!NIL_P(limit) && lim <= i) break;
8544 }
8545 }
8546 else if (ascii_isspace(c)) {
8547 SPLIT_STR(beg, end-beg);
8548 skip = 1;
8549 beg = ptr - bptr;
8550 if (!NIL_P(limit)) ++i;
8551 }
8552 else {
8553 end = ptr - bptr;
8554 }
8555 }
8556 }
8557 else {
8558 while (ptr < eptr) {
8559 int n;
8560
8561 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8562 ptr += n;
8563 if (skip) {
8564 if (rb_isspace(c)) {
8565 beg = ptr - bptr;
8566 }
8567 else {
8568 end = ptr - bptr;
8569 skip = 0;
8570 if (!NIL_P(limit) && lim <= i) break;
8571 }
8572 }
8573 else if (rb_isspace(c)) {
8574 SPLIT_STR(beg, end-beg);
8575 skip = 1;
8576 beg = ptr - bptr;
8577 if (!NIL_P(limit)) ++i;
8578 }
8579 else {
8580 end = ptr - bptr;
8581 }
8582 }
8583 }
8584 }
8585 else if (split_type == SPLIT_TYPE_STRING) {
8586 char *str_start = ptr;
8587 char *substr_start = ptr;
8588 char *sptr = RSTRING_PTR(spat);
8589 long slen = RSTRING_LEN(spat);
8590
8591 mustnot_broken(str);
8592 enc = rb_enc_check(str, spat);
8593 while (ptr < eptr &&
8594 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8595 /* Check we are at the start of a char */
8596 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8597 if (t != ptr + end) {
8598 ptr = t;
8599 continue;
8600 }
8601 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8602 ptr += end + slen;
8603 substr_start = ptr;
8604 if (!NIL_P(limit) && lim <= ++i) break;
8605 }
8606 beg = ptr - str_start;
8607 }
8608 else if (split_type == SPLIT_TYPE_CHARS) {
8609 char *str_start = ptr;
8610 int n;
8611
8612 mustnot_broken(str);
8613 enc = rb_enc_get(str);
8614 while (ptr < eptr &&
8615 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8616 SPLIT_STR(ptr - str_start, n);
8617 ptr += n;
8618 if (!NIL_P(limit) && lim <= ++i) break;
8619 }
8620 beg = ptr - str_start;
8621 }
8622 else {
8623 long len = RSTRING_LEN(str);
8624 long start = beg;
8625 long idx;
8626 int last_null = 0;
8627 struct re_registers *regs;
8628 VALUE match = 0;
8629
8630 for (; rb_reg_search(spat, str, start, 0) >= 0;
8631 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8632 match = rb_backref_get();
8633 if (!result) rb_match_busy(match);
8634 regs = RMATCH_REGS(match);
8635 end = BEG(0);
8636 if (start == end && BEG(0) == END(0)) {
8637 if (!ptr) {
8638 SPLIT_STR(0, 0);
8639 break;
8640 }
8641 else if (last_null == 1) {
8642 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8643 beg = start;
8644 }
8645 else {
8646 if (start == len)
8647 start++;
8648 else
8649 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8650 last_null = 1;
8651 continue;
8652 }
8653 }
8654 else {
8655 SPLIT_STR(beg, end-beg);
8656 beg = start = END(0);
8657 }
8658 last_null = 0;
8659
8660 for (idx=1; idx < regs->num_regs; idx++) {
8661 if (BEG(idx) == -1) continue;
8662 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8663 }
8664 if (!NIL_P(limit) && lim <= ++i) break;
8665 }
8666 if (match) rb_match_unbusy(match);
8667 }
8668 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8669 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8670 }
8671
8672 return result ? result : str;
8673}
8674
8675VALUE
8676rb_str_split(VALUE str, const char *sep0)
8677{
8678 VALUE sep;
8679
8680 StringValue(str);
8681 sep = rb_str_new_cstr(sep0);
8682 return rb_str_split_m(1, &sep, str);
8683}
8684
/*
 * Result container for the array-returning enumerators: 0 when the caller
 * supplied a block (elements are then yielded), otherwise a new Array with
 * capacity +size+.  The method-name argument +m+ is not used.
 */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8686
8687static inline int
8688enumerator_element(VALUE ary, VALUE e)
8689{
8690 if (ary) {
8691 rb_ary_push(ary, e);
8692 return 0;
8693 }
8694 else {
8695 rb_yield(e);
8696 return 1;
8697 }
8698}
8699
8700#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8701
8702static const char *
8703chomp_newline(const char *p, const char *e, rb_encoding *enc)
8704{
8705 const char *prev = rb_enc_prev_char(p, e, e, enc);
8706 if (rb_enc_is_newline(prev, e, enc)) {
8707 e = prev;
8708 prev = rb_enc_prev_char(p, e, e, enc);
8709 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8710 e = prev;
8711 }
8712 return e;
8713}
8714
8715static VALUE
8716get_rs(void)
8717{
8718 VALUE rs = rb_rs;
8719 if (!NIL_P(rs) &&
8720 (!RB_TYPE_P(rs, T_STRING) ||
8721 RSTRING_LEN(rs) != 1 ||
8722 RSTRING_PTR(rs)[0] != '\n')) {
8723 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8724 }
8725 return rs;
8726}
8727
8728#define rb_rs get_rs()
8729
/*
 * Shared implementation of String#each_line and String#lines.
 *
 * argc/argv carry an optional record separator and an optional
 * <code>chomp:</code> keyword; when no separator is given, rb_rs is used.
 * Each line is delivered through ENUM_ELEM: pushed onto +ary+ when +ary+ is
 * non-zero, yielded to the block otherwise.
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* from here on chomp is used as a plain 0/1 flag */
        chomp = (chomp != Qundef && RTEST(chomp));
    }

    /* nil separator: the whole string is the single "line" */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* iterate over a frozen view so the pointers below stay meaningful
     * while the block runs */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    /* the default separator skips the compatibility check against str */
    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            /* inner loop: advance subend character by character; a "\r"
             * directly before a newline is consumed together with it.
             * subptr == NULL means no paragraph text has been seen yet;
             * eol marks the position right after the last newline run. */
            do {
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* second consecutive newline: the paragraph ends here */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) eol = subend;
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? 0 : rslen));
            /* a block was called: verify str was not modified meanwhile */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* remember whether the separator is exactly one newline character,
         * which selects chomp_newline() for chomping below */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible encoding: transcode the
     * separator into the string's encoding before searching */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* ignore byte matches that do not start on a character boundary */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing text after the last separator */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
8864
8865/*
8866 * call-seq:
8867 * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8868 * str.each_line(separator=$/, chomp: false) -> an_enumerator
8869 *
8870 * Splits <i>str</i> using the supplied parameter as the record
8871 * separator (<code>$/</code> by default), passing each substring in
8872 * turn to the supplied block. If a zero-length record separator is
8873 * supplied, the string is split into paragraphs delimited by
8874 * multiple successive newlines.
8875 *
8876 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8877 * line.
8878 *
8879 * If no block is given, an enumerator is returned instead.
8880 *
8881 * "hello\nworld".each_line {|s| p s}
8882 * # prints:
8883 * # "hello\n"
8884 * # "world"
8885 *
8886 * "hello\nworld".each_line('l') {|s| p s}
8887 * # prints:
8888 * # "hel"
8889 * # "l"
8890 * # "o\nworl"
8891 * # "d"
8892 *
8893 * "hello\n\n\nworld".each_line('') {|s| p s}
8894 * # prints
8895 * # "hello\n\n"
8896 * # "world"
8897 *
8898 * "hello\nworld".each_line(chomp: true) {|s| p s}
8899 * # prints:
8900 * # "hello"
8901 * # "world"
8902 *
8903 * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8904 * # prints:
8905 * # "he"
8906 * # ""
8907 * # "o\nwor"
8908 * # "d"
8909 *
8910 */
8911
8912static VALUE
8913rb_str_each_line(int argc, VALUE *argv, VALUE str)
8914{
8915 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8916 return rb_str_enumerate_lines(argc, argv, str, 0);
8917}
8918
8919/*
8920 * call-seq:
8921 * str.lines(separator=$/, chomp: false) -> an_array
8922 *
8923 * Returns an array of lines in <i>str</i> split using the supplied
8924 * record separator (<code>$/</code> by default). This is a
8925 * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8926 *
8927 * If +chomp+ is +true+, +separator+ will be removed from the end of each
8928 * line.
8929 *
8930 * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8931 * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8932 * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8933 *
8934 * If a block is given, which is a deprecated form, works the same as
8935 * <code>each_line</code>.
8936 */
8937
8938static VALUE
8939rb_str_lines(int argc, VALUE *argv, VALUE str)
8940{
8941 VALUE ary = WANTARRAY("lines", 0);
8942 return rb_str_enumerate_lines(argc, argv, str, ary);
8943}
8944
8945static VALUE
8946rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8947{
8948 return LONG2FIX(RSTRING_LEN(str));
8949}
8950
8951static VALUE
8952rb_str_enumerate_bytes(VALUE str, VALUE ary)
8953{
8954 long i;
8955
8956 for (i=0; i<RSTRING_LEN(str); i++) {
8957 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8958 }
8959 if (ary)
8960 return ary;
8961 else
8962 return str;
8963}
8964
8965/*
8966 * call-seq:
8967 * str.each_byte {|integer| block } -> str
8968 * str.each_byte -> an_enumerator
8969 *
8970 * Passes each byte in <i>str</i> to the given block, or returns an
8971 * enumerator if no block is given.
8972 *
8973 * "hello".each_byte {|c| print c, ' ' }
8974 *
8975 * <em>produces:</em>
8976 *
8977 * 104 101 108 108 111
8978 */
8979
8980static VALUE
8981rb_str_each_byte(VALUE str)
8982{
8983 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8984 return rb_str_enumerate_bytes(str, 0);
8985}
8986
8987/*
8988 * call-seq:
8989 * str.bytes -> an_array
8990 *
8991 * Returns an array of bytes in <i>str</i>. This is a shorthand for
8992 * <code>str.each_byte.to_a</code>.
8993 *
8994 * If a block is given, which is a deprecated form, works the same as
8995 * <code>each_byte</code>.
8996 */
8997
8998static VALUE
8999rb_str_bytes(VALUE str)
9000{
9001 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9002 return rb_str_enumerate_bytes(str, ary);
9003}
9004
9005static VALUE
9006rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9007{
9008 return rb_str_length(str);
9009}
9010
9011static VALUE
9012rb_str_enumerate_chars(VALUE str, VALUE ary)
9013{
9014 VALUE orig = str;
9015 long i, len, n;
9016 const char *ptr;
9017 rb_encoding *enc;
9018
9019 str = rb_str_new_frozen(str);
9020 ptr = RSTRING_PTR(str);
9021 len = RSTRING_LEN(str);
9022 enc = rb_enc_get(str);
9023
9025 for (i = 0; i < len; i += n) {
9026 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9027 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9028 }
9029 }
9030 else {
9031 for (i = 0; i < len; i += n) {
9032 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9033 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9034 }
9035 }
9036 RB_GC_GUARD(str);
9037 if (ary)
9038 return ary;
9039 else
9040 return orig;
9041}
9042
9043/*
9044 * call-seq:
9045 * str.each_char {|cstr| block } -> str
9046 * str.each_char -> an_enumerator
9047 *
9048 * Passes each character in <i>str</i> to the given block, or returns
9049 * an enumerator if no block is given.
9050 *
9051 * "hello".each_char {|c| print c, ' ' }
9052 *
9053 * <em>produces:</em>
9054 *
9055 * h e l l o
9056 */
9057
9058static VALUE
9059rb_str_each_char(VALUE str)
9060{
9061 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9062 return rb_str_enumerate_chars(str, 0);
9063}
9064
9065/*
9066 * call-seq:
9067 * str.chars -> an_array
9068 *
9069 * Returns an array of characters in <i>str</i>. This is a shorthand
9070 * for <code>str.each_char.to_a</code>.
9071 *
9072 * If a block is given, which is a deprecated form, works the same as
9073 * <code>each_char</code>.
9074 */
9075
9076static VALUE
9077rb_str_chars(VALUE str)
9078{
9079 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9080 return rb_str_enumerate_chars(str, ary);
9081}
9082
9083static VALUE
9084rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9085{
9086 VALUE orig = str;
9087 int n;
9088 unsigned int c;
9089 const char *ptr, *end;
9090 rb_encoding *enc;
9091
9092 if (single_byte_optimizable(str))
9093 return rb_str_enumerate_bytes(str, ary);
9094
9095 str = rb_str_new_frozen(str);
9096 ptr = RSTRING_PTR(str);
9097 end = RSTRING_END(str);
9098 enc = STR_ENC_GET(str);
9099
9100 while (ptr < end) {
9101 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9102 ENUM_ELEM(ary, UINT2NUM(c));
9103 ptr += n;
9104 }
9105 RB_GC_GUARD(str);
9106 if (ary)
9107 return ary;
9108 else
9109 return orig;
9110}
9111
9112/*
9113 * call-seq:
9114 * str.each_codepoint {|integer| block } -> str
9115 * str.each_codepoint -> an_enumerator
9116 *
9117 * Passes the Integer ordinal of each character in <i>str</i>,
9118 * also known as a <i>codepoint</i> when applied to Unicode strings to the
9119 * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9120 * values are directly derived from the binary representation
9121 * of each character.
9122 *
9123 * If no block is given, an enumerator is returned instead.
9124 *
9125 * "hello\u0639".each_codepoint {|c| print c, ' ' }
9126 *
9127 * <em>produces:</em>
9128 *
9129 * 104 101 108 108 111 1593
9130 */
9131
9132static VALUE
9133rb_str_each_codepoint(VALUE str)
9134{
9135 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9136 return rb_str_enumerate_codepoints(str, 0);
9137}
9138
9139/*
9140 * call-seq:
9141 * str.codepoints -> an_array
9142 *
9143 * Returns an array of the Integer ordinals of the
9144 * characters in <i>str</i>. This is a shorthand for
9145 * <code>str.each_codepoint.to_a</code>.
9146 *
9147 * If a block is given, which is a deprecated form, works the same as
9148 * <code>each_codepoint</code>.
9149 */
9150
9151static VALUE
9152rb_str_codepoints(VALUE str)
9153{
9154 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9155 return rb_str_enumerate_codepoints(str, ary);
9156}
9157
/*
 * Returns a compiled regexp for the pattern \X in encoding +enc+.
 *
 * Only the UTF-8 regexp is stored in a function-local static and reused on
 * later calls.  For any other encoding a new regexp is compiled on every
 * call and this function keeps no reference to it.
 *
 * Calls rb_fatal if the pattern fails to compile.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);
    regex_t *reg_grapheme_cluster = NULL;
    static regex_t *reg_grapheme_cluster_utf8 = NULL;

    /* synchronize */
    if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
        reg_grapheme_cluster = reg_grapheme_cluster_utf8;
    }
    if (!reg_grapheme_cluster) {
        /* default pattern source: the two ASCII bytes '\' and 'X' */
        const OnigUChar source_ascii[] = "\\X";
        OnigErrorInfo einfo;
        const OnigUChar *source = source_ascii;
        size_t source_len = sizeof(source_ascii) - 1;
        /* for UTF-16/UTF-32 the same two characters are spelled out in the
         * code-unit width and byte order of the encoding */
        switch (encidx) {
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
          case ENCINDEX_UTF_##e: { \
            static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
            source = source_UTF_##e; \
            source_len = sizeof(source_UTF_##e); \
            break; \
          }
            CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
        }
        int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                         ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
        if (r) {
            UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
            onig_error_code_to_str(message, r, &einfo);
            rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
        }
        /* remember the UTF-8 regexp for subsequent calls */
        if (encidx == rb_utf8_encindex()) {
            reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
        }
    }
    return reg_grapheme_cluster;
}
9206
9207static VALUE
9208rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9209{
9210 size_t grapheme_cluster_count = 0;
9211 regex_t *reg_grapheme_cluster = NULL;
9212 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9213 const char *ptr, *end;
9214
9215 if (!rb_enc_unicode_p(enc)) {
9216 return rb_str_length(str);
9217 }
9218
9219 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9220 ptr = RSTRING_PTR(str);
9221 end = RSTRING_END(str);
9222
9223 while (ptr < end) {
9224 OnigPosition len = onig_match(reg_grapheme_cluster,
9225 (const OnigUChar *)ptr, (const OnigUChar *)end,
9226 (const OnigUChar *)ptr, NULL, 0);
9227 if (len <= 0) break;
9228 grapheme_cluster_count++;
9229 ptr += len;
9230 }
9231
9232 return SIZET2NUM(grapheme_cluster_count);
9233}
9234
9235static VALUE
9236rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9237{
9238 VALUE orig = str;
9239 regex_t *reg_grapheme_cluster = NULL;
9240 rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
9241 const char *ptr0, *ptr, *end;
9242
9243 if (!rb_enc_unicode_p(enc)) {
9244 return rb_str_enumerate_chars(str, ary);
9245 }
9246
9247 if (!ary) str = rb_str_new_frozen(str);
9248 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9249 ptr0 = ptr = RSTRING_PTR(str);
9250 end = RSTRING_END(str);
9251
9252 while (ptr < end) {
9253 OnigPosition len = onig_match(reg_grapheme_cluster,
9254 (const OnigUChar *)ptr, (const OnigUChar *)end,
9255 (const OnigUChar *)ptr, NULL, 0);
9256 if (len <= 0) break;
9257 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9258 ptr += len;
9259 }
9260 RB_GC_GUARD(str);
9261 if (ary)
9262 return ary;
9263 else
9264 return orig;
9265}
9266
9267/*
9268 * call-seq:
9269 * str.each_grapheme_cluster {|cstr| block } -> str
9270 * str.each_grapheme_cluster -> an_enumerator
9271 *
9272 * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9273 * an enumerator if no block is given.
9274 * Unlike String#each_char, this enumerates by grapheme clusters defined by
9275 * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9276 *
9277 * "a\u0300".each_char.to_a.size #=> 2
9278 * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9279 *
9280 */
9281
9282static VALUE
9283rb_str_each_grapheme_cluster(VALUE str)
9284{
9285 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9286 return rb_str_enumerate_grapheme_clusters(str, 0);
9287}
9288
9289/*
9290 * call-seq:
9291 * str.grapheme_clusters -> an_array
9292 *
9293 * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9294 * for <code>str.each_grapheme_cluster.to_a</code>.
9295 *
9296 * If a block is given, which is a deprecated form, works the same as
9297 * <code>each_grapheme_cluster</code>.
9298 */
9299
9300static VALUE
9301rb_str_grapheme_clusters(VALUE str)
9302{
9303 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9304 return rb_str_enumerate_grapheme_clusters(str, ary);
9305}
9306
9307static long
9308chopped_length(VALUE str)
9309{
9310 rb_encoding *enc = STR_ENC_GET(str);
9311 const char *p, *p2, *beg, *end;
9312
9313 beg = RSTRING_PTR(str);
9314 end = beg + RSTRING_LEN(str);
9315 if (beg >= end) return 0;
9316 p = rb_enc_prev_char(beg, end, end, enc);
9317 if (!p) return 0;
9318 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9319 p2 = rb_enc_prev_char(beg, p, end, enc);
9320 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9321 }
9322 return p - beg;
9323}
9324
9325/*
9326 * call-seq:
9327 * str.chop! -> str or nil
9328 *
9329 * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9330 * <code>nil</code> if <i>str</i> is the empty string. See also
9331 * String#chomp!.
9332 */
9333
9334static VALUE
9335rb_str_chop_bang(VALUE str)
9336{
9337 str_modify_keep_cr(str);
9338 if (RSTRING_LEN(str) > 0) {
9339 long len;
9340 len = chopped_length(str);
9341 STR_SET_LEN(str, len);
9342 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9343 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9345 }
9346 return str;
9347 }
9348 return Qnil;
9349}
9350
9351
9352/*
9353 * call-seq:
9354 * str.chop -> new_str
9355 *
9356 * Returns a new String with the last character removed. If the
9357 * string ends with <code>\r\n</code>, both characters are
9358 * removed. Applying <code>chop</code> to an empty string returns an
9359 * empty string. String#chomp is often a safer alternative, as it
9360 * leaves the string unchanged if it doesn't end in a record
9361 * separator.
9362 *
9363 * "string\r\n".chop #=> "string"
9364 * "string\n\r".chop #=> "string\n"
9365 * "string\n".chop #=> "string"
9366 * "string".chop #=> "strin"
9367 * "x".chop.chop #=> ""
9368 */
9369
9370static VALUE
9371rb_str_chop(VALUE str)
9372{
9373 return rb_str_subseq(str, 0, chopped_length(str));
9374}
9375
9376static long
9377smart_chomp(VALUE str, const char *e, const char *p)
9378{
9379 rb_encoding *enc = rb_enc_get(str);
9380 if (rb_enc_mbminlen(enc) > 1) {
9381 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9382 if (rb_enc_is_newline(pp, e, enc)) {
9383 e = pp;
9384 }
9385 pp = e - rb_enc_mbminlen(enc);
9386 if (pp >= p) {
9387 pp = rb_enc_left_char_head(p, pp, e, enc);
9388 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9389 e = pp;
9390 }
9391 }
9392 }
9393 else {
9394 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9395 case '\n':
9396 if (--e > p && *(e-1) == '\r') {
9397 --e;
9398 }
9399 break;
9400 case '\r':
9401 --e;
9402 break;
9403 }
9404 }
9405 return e - p;
9406}
9407
/*
 * Byte length +str+ would have after String#chomp with separator +rs+
 * (+rs+ must be a String; callers handle nil before calling).
 *
 * - The default separator, or a separator that is a single newline
 *   character, uses smart_chomp().
 * - An empty separator removes all trailing newlines, each optionally
 *   preceded by "\r".
 * - Any other separator is removed only when the string ends with exactly
 *   those bytes and the match starts on a character boundary.
 *
 * Returns the unchanged length when nothing is to be removed.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip every trailing newline */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* a separator longer than the string cannot match */
    if (rslen > len) return len;

    enc = rb_enc_get(rs);
    /* last byte of the separator, used as a cheap pre-check below */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* accept the byte match only if it begins at a character head */
        if (rb_enc_left_char_head(p, pp, e, enc) == pp)
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9479
9485static VALUE
9486chomp_rs(int argc, const VALUE *argv)
9487{
9488 rb_check_arity(argc, 0, 1);
9489 if (argc > 0) {
9490 VALUE rs = argv[0];
9491 if (!NIL_P(rs)) StringValue(rs);
9492 return rs;
9493 }
9494 else {
9495 return rb_rs;
9496 }
9497}
9498
9499VALUE
9500rb_str_chomp_string(VALUE str, VALUE rs)
9501{
9502 long olen = RSTRING_LEN(str);
9503 long len = chompped_length(str, rs);
9504 if (len >= olen) return Qnil;
9505 str_modify_keep_cr(str);
9506 STR_SET_LEN(str, len);
9507 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9508 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9510 }
9511 return str;
9512}
9513
9514/*
9515 * call-seq:
9516 * str.chomp!(separator=$/) -> str or nil
9517 *
9518 * Modifies <i>str</i> in place as described for String#chomp,
9519 * returning <i>str</i>, or <code>nil</code> if no modifications were
9520 * made.
9521 */
9522
9523static VALUE
9524rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9525{
9526 VALUE rs;
9527 str_modifiable(str);
9528 if (RSTRING_LEN(str) == 0) return Qnil;
9529 rs = chomp_rs(argc, argv);
9530 if (NIL_P(rs)) return Qnil;
9531 return rb_str_chomp_string(str, rs);
9532}
9533
9534
9535/*
9536 * call-seq:
9537 * str.chomp(separator=$/) -> new_str
9538 *
9539 * Returns a new String with the given record separator removed
9540 * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9541 * changed from the default Ruby record separator, then <code>chomp</code> also
9542 * removes carriage return characters (that is, it will remove <code>\n</code>,
9543 * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9544 * it will remove all trailing newlines from the string.
9545 *
9546 * "hello".chomp #=> "hello"
9547 * "hello\n".chomp #=> "hello"
9548 * "hello\r\n".chomp #=> "hello"
9549 * "hello\n\r".chomp #=> "hello\n"
9550 * "hello\r".chomp #=> "hello"
9551 * "hello \n there".chomp #=> "hello \n there"
9552 * "hello".chomp("llo") #=> "he"
9553 * "hello\r\n\r\n".chomp('') #=> "hello"
9554 * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9555 */
9556
9557static VALUE
9558rb_str_chomp(int argc, VALUE *argv, VALUE str)
9559{
9560 VALUE rs = chomp_rs(argc, argv);
9561 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9562 return rb_str_subseq(str, 0, chompped_length(str, rs));
9563}
9564
9565static long
9566lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9567{
9568 const char *const start = s;
9569
9570 if (!s || s >= e) return 0;
9571
9572 /* remove spaces at head */
9573 if (single_byte_optimizable(str)) {
9574 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9575 }
9576 else {
9577 while (s < e) {
9578 int n;
9579 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9580
9581 if (cc && !rb_isspace(cc)) break;
9582 s += n;
9583 }
9584 }
9585 return s - start;
9586}
9587
9588/*
9589 * call-seq:
9590 * str.lstrip! -> self or nil
9591 *
9592 * Removes leading whitespace from the receiver.
9593 * Returns the altered receiver, or +nil+ if no change was made.
9594 * See also String#rstrip! and String#strip!.
9595 *
9596 * Refer to String#strip for the definition of whitespace.
9597 *
9598 * " hello ".lstrip! #=> "hello "
9599 * "hello ".lstrip! #=> nil
9600 * "hello".lstrip! #=> nil
9601 */
9602
9603static VALUE
9604rb_str_lstrip_bang(VALUE str)
9605{
9606 rb_encoding *enc;
9607 char *start, *s;
9608 long olen, loffset;
9609
9610 str_modify_keep_cr(str);
9611 enc = STR_ENC_GET(str);
9612 RSTRING_GETMEM(str, start, olen);
9613 loffset = lstrip_offset(str, start, start+olen, enc);
9614 if (loffset > 0) {
9615 long len = olen-loffset;
9616 s = start + loffset;
9617 memmove(start, s, len);
9618 STR_SET_LEN(str, len);
9619 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9620 return str;
9621 }
9622 return Qnil;
9623}
9624
9625
9626/*
9627 * call-seq:
9628 * str.lstrip -> new_str
9629 *
9630 * Returns a copy of the receiver with leading whitespace removed.
9631 * See also String#rstrip and String#strip.
9632 *
9633 * Refer to String#strip for the definition of whitespace.
9634 *
9635 * " hello ".lstrip #=> "hello "
9636 * "hello".lstrip #=> "hello"
9637 */
9638
9639static VALUE
9640rb_str_lstrip(VALUE str)
9641{
9642 char *start;
9643 long len, loffset;
9644 RSTRING_GETMEM(str, start, len);
9645 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9646 if (loffset <= 0) return str_duplicate(rb_cString, str);
9647 return rb_str_subseq(str, loffset, len - loffset);
9648}
9649
9650static long
9651rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9652{
9653 const char *t;
9654
9655 rb_str_check_dummy_enc(enc);
9656 if (!s || s >= e) return 0;
9657 t = e;
9658
9659 /* remove trailing spaces or '\0's */
9660 if (single_byte_optimizable(str)) {
9661 unsigned char c;
9662 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9663 }
9664 else {
9665 char *tp;
9666
9667 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9668 unsigned int c = rb_enc_codepoint(tp, e, enc);
9669 if (c && !rb_isspace(c)) break;
9670 t = tp;
9671 }
9672 }
9673 return e - t;
9674}
9675
9676/*
9677 * call-seq:
9678 * str.rstrip! -> self or nil
9679 *
9680 * Removes trailing whitespace from the receiver.
9681 * Returns the altered receiver, or +nil+ if no change was made.
9682 * See also String#lstrip! and String#strip!.
9683 *
9684 * Refer to String#strip for the definition of whitespace.
9685 *
9686 * " hello ".rstrip! #=> " hello"
9687 * " hello".rstrip! #=> nil
9688 * "hello".rstrip! #=> nil
9689 */
9690
9691static VALUE
9692rb_str_rstrip_bang(VALUE str)
9693{
9694 rb_encoding *enc;
9695 char *start;
9696 long olen, roffset;
9697
9698 str_modify_keep_cr(str);
9699 enc = STR_ENC_GET(str);
9700 RSTRING_GETMEM(str, start, olen);
9701 roffset = rstrip_offset(str, start, start+olen, enc);
9702 if (roffset > 0) {
9703 long len = olen - roffset;
9704
9705 STR_SET_LEN(str, len);
9706 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9707 return str;
9708 }
9709 return Qnil;
9710}
9711
9712
9713/*
9714 * call-seq:
9715 * str.rstrip -> new_str
9716 *
9717 * Returns a copy of the receiver with trailing whitespace removed.
9718 * See also String#lstrip and String#strip.
9719 *
9720 * Refer to String#strip for the definition of whitespace.
9721 *
9722 * " hello ".rstrip #=> " hello"
9723 * "hello".rstrip #=> "hello"
9724 */
9725
9726static VALUE
9727rb_str_rstrip(VALUE str)
9728{
9729 rb_encoding *enc;
9730 char *start;
9731 long olen, roffset;
9732
9733 enc = STR_ENC_GET(str);
9734 RSTRING_GETMEM(str, start, olen);
9735 roffset = rstrip_offset(str, start, start+olen, enc);
9736
9737 if (roffset <= 0) return str_duplicate(rb_cString, str);
9738 return rb_str_subseq(str, 0, olen-roffset);
9739}
9740
9741
9742/*
9743 * call-seq:
9744 * str.strip! -> self or nil
9745 *
9746 * Removes leading and trailing whitespace from the receiver.
9747 * Returns the altered receiver, or +nil+ if there was no change.
9748 *
9749 * Refer to String#strip for the definition of whitespace.
9750 *
9751 * " hello ".strip! #=> "hello"
9752 * "hello".strip! #=> nil
9753 */
9754
9755static VALUE
9756rb_str_strip_bang(VALUE str)
9757{
9758 char *start;
9759 long olen, loffset, roffset;
9760 rb_encoding *enc;
9761
9762 str_modify_keep_cr(str);
9763 enc = STR_ENC_GET(str);
9764 RSTRING_GETMEM(str, start, olen);
9765 loffset = lstrip_offset(str, start, start+olen, enc);
9766 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9767
9768 if (loffset > 0 || roffset > 0) {
9769 long len = olen-roffset;
9770 if (loffset > 0) {
9771 len -= loffset;
9772 memmove(start, start + loffset, len);
9773 }
9774 STR_SET_LEN(str, len);
9775 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9776 return str;
9777 }
9778 return Qnil;
9779}
9780
9781
9782/*
9783 * call-seq:
9784 * str.strip -> new_str
9785 *
9786 * Returns a copy of the receiver with leading and trailing whitespace removed.
9787 *
9788 * Whitespace is defined as any of the following characters:
9789 * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9790 *
9791 * " hello ".strip #=> "hello"
9792 * "\tgoodbye\r\n".strip #=> "goodbye"
9793 * "\x00\t\n\v\f\r ".strip #=> ""
9794 * "hello".strip #=> "hello"
9795 */
9796
9797static VALUE
9798rb_str_strip(VALUE str)
9799{
9800 char *start;
9801 long olen, loffset, roffset;
9802 rb_encoding *enc = STR_ENC_GET(str);
9803
9804 RSTRING_GETMEM(str, start, olen);
9805 loffset = lstrip_offset(str, start, start+olen, enc);
9806 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9807
9808 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9809 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9810}
9811
9812static VALUE
9813scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9814{
9815 VALUE result, match;
9816 struct re_registers *regs;
9817 int i;
9818 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9819 if (pos >= 0) {
9820 if (BUILTIN_TYPE(pat) == T_STRING) {
9821 regs = NULL;
9822 end = pos + RSTRING_LEN(pat);
9823 }
9824 else {
9825 match = rb_backref_get();
9826 regs = RMATCH_REGS(match);
9827 pos = BEG(0);
9828 end = END(0);
9829 }
9830 if (pos == end) {
9831 rb_encoding *enc = STR_ENC_GET(str);
9832 /*
9833 * Always consume at least one character of the input string
9834 */
9835 if (RSTRING_LEN(str) > end)
9836 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9837 RSTRING_END(str), enc);
9838 else
9839 *start = end + 1;
9840 }
9841 else {
9842 *start = end;
9843 }
9844 if (!regs || regs->num_regs == 1) {
9845 result = rb_str_subseq(str, pos, end - pos);
9846 return result;
9847 }
9848 result = rb_ary_new2(regs->num_regs);
9849 for (i=1; i < regs->num_regs; i++) {
9850 VALUE s = Qnil;
9851 if (BEG(i) >= 0) {
9852 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9853 }
9854 rb_ary_push(result, s);
9855 }
9856
9857 return result;
9858 }
9859 return Qnil;
9860}
9861
9862
9863/*
9864 * call-seq:
9865 * str.scan(pattern) -> array
9866 * str.scan(pattern) {|match, ...| block } -> str
9867 *
9868 * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9869 * Regexp or a String). For each match, a result is
9870 * generated and either added to the result array or passed to the block. If
9871 * the pattern contains no groups, each individual result consists of the
9872 * matched string, <code>$&</code>. If the pattern contains groups, each
9873 * individual result is itself an array containing one entry per group.
9874 *
9875 * a = "cruel world"
9876 * a.scan(/\w+/) #=> ["cruel", "world"]
9877 * a.scan(/.../) #=> ["cru", "el ", "wor"]
9878 * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9879 * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9880 *
9881 * And the block form:
9882 *
9883 * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9884 * print "\n"
9885 * a.scan(/(.)(.)/) {|x,y| print y, x }
9886 * print "\n"
9887 *
9888 * <em>produces:</em>
9889 *
9890 * <<cruel>> <<world>>
9891 * rceu lowlr
9892 */
9893
9894static VALUE
9895rb_str_scan(VALUE str, VALUE pat)
9896{
9897 VALUE result;
9898 long start = 0;
9899 long last = -1, prev = 0;
9900 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9901
9902 pat = get_pat_quoted(pat, 1);
9903 mustnot_broken(str);
9904 if (!rb_block_given_p()) {
9905 VALUE ary = rb_ary_new();
9906
9907 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9908 last = prev;
9909 prev = start;
9910 rb_ary_push(ary, result);
9911 }
9912 if (last >= 0) rb_pat_search(pat, str, last, 1);
9913 else rb_backref_set(Qnil);
9914 return ary;
9915 }
9916
9917 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9918 last = prev;
9919 prev = start;
9920 rb_yield(result);
9921 str_mod_check(str, p, len);
9922 }
9923 if (last >= 0) rb_pat_search(pat, str, last, 1);
9924 return str;
9925}
9926
9927
9928/*
9929 * call-seq:
9930 * str.hex -> integer
9931 *
9932 * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9933 * (with an optional sign and an optional <code>0x</code>) and returns the
9934 * corresponding number. Zero is returned on error.
9935 *
9936 * "0x0a".hex #=> 10
9937 * "-1234".hex #=> -4660
9938 * "0".hex #=> 0
9939 * "wombat".hex #=> 0
9940 */
9941
9942static VALUE
9943rb_str_hex(VALUE str)
9944{
9945 return rb_str_to_inum(str, 16, FALSE);
9946}
9947
9948
9949/*
9950 * call-seq:
9951 * str.oct -> integer
9952 *
9953 * Treats leading characters of <i>str</i> as a string of octal digits (with an
9954 * optional sign) and returns the corresponding number. Returns 0 if the
9955 * conversion fails.
9956 *
9957 * "123".oct #=> 83
9958 * "-377".oct #=> -255
9959 * "bad".oct #=> 0
9960 * "0377bad".oct #=> 255
9961 *
9962 * If +str+ starts with <code>0</code>, radix indicators are honored.
9963 * See Kernel#Integer.
9964 */
9965
9966static VALUE
9967rb_str_oct(VALUE str)
9968{
9969 return rb_str_to_inum(str, -8, FALSE);
9970}
9971
9972#ifndef HAVE_CRYPT_R
9973# include "ruby/thread_native.h"
9974# include "ruby/atomic.h"
9975
9976static struct {
9977 rb_atomic_t initialized;
9978 rb_nativethread_lock_t lock;
9979} crypt_mutex;
9980
9981static void
9982crypt_mutex_destroy(void)
9983{
9984 RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9985 rb_nativethread_lock_destroy(&crypt_mutex.lock);
9986 crypt_mutex.initialized = 0;
9987}
9988
9989static void
9990crypt_mutex_initialize(void)
9991{
9992 rb_atomic_t i;
9993 while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9994 switch (i) {
9995 case 0:
9996 rb_nativethread_lock_initialize(&crypt_mutex.lock);
9997 atexit(crypt_mutex_destroy);
9998 RUBY_ASSERT(crypt_mutex.initialized == 2);
9999 RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
10000 break;
10001 case 1:
10002 break;
10003 default:
10004 rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
10005 }
10006}
10007#endif
10008
10009/*
10010 * call-seq:
10011 * str.crypt(salt_str) -> new_str
10012 *
10013 * Returns the string generated by calling <code>crypt(3)</code>
10014 * standard library function with <code>str</code> and
10015 * <code>salt_str</code>, in this order, as its arguments. Please do
10016 * not use this method any longer. It is legacy; provided only for
10017 * backward compatibility with ruby scripts in earlier days. It is
10018 * bad to use in contemporary programs for several reasons:
10019 *
10020 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10021 * run. The generated string lacks data portability.
10022 *
10023 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10024 * (i.e. silently ends up in unexpected results).
10025 *
10026 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10027 * thread safe.
10028 *
10029 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10030 * very very weak. According to its manpage, Linux's traditional
10031 * <code>crypt(3)</code> output has only 2**56 variations; too
10032 * easy to brute force today. And this is the default behaviour.
10033 *
10034 * * In order to make things robust some OSes implement so-called
10035 * "modular" usage. To go through, you have to do a complex
10036 * build-up of the <code>salt_str</code> parameter, by hand.
10037 * Failure in generation of a proper salt string tends not to
10038 * yield any errors; typos in parameters are normally not
10039 * detectable.
10040 *
10041 * * For instance, in the following example, the second invocation
10042 * of String#crypt is wrong; it has a typo in "round=" (lacks
10043 * "s"). However the call does not fail and something unexpected
10044 * is generated.
10045 *
10046 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10047 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10048 *
10049 * * Even in the "modular" mode, some hash functions are considered
10050 * archaic and no longer recommended at all; for instance module
10051 * <code>$1$</code> is officially abandoned by its author: see
10052 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10053 * instance module <code>$3$</code> is considered completely
10054 * broken: see the manpage of FreeBSD.
10055 *
10056 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10057 * written above, <code>crypt(3)</code> on Mac OS never fails.
10058 * This means even if you build up a proper salt string it
10059 * generates a traditional DES hash anyways, and there is no way
10060 * for you to be aware of.
10061 *
10062 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10063 *
10064 * If for some reason you cannot migrate to other secure contemporary
10065 * password hashing algorithms, install the string-crypt gem and
10066 * <code>require 'string/crypt'</code> to continue using it.
10067 */
10068
10069static VALUE
10070rb_str_crypt(VALUE str, VALUE salt)
10071{
10072#ifdef HAVE_CRYPT_R
10073 VALUE databuf;
10074 struct crypt_data *data;
10075# define CRYPT_END() ALLOCV_END(databuf)
10076#else
10077 extern char *crypt(const char *, const char *);
10078# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10079#endif
10080 VALUE result;
10081 const char *s, *saltp;
10082 char *res;
10083#ifdef BROKEN_CRYPT
10084 char salt_8bit_clean[3];
10085#endif
10086
10087 StringValue(salt);
10088 mustnot_wchar(str);
10089 mustnot_wchar(salt);
10090 s = StringValueCStr(str);
10091 saltp = RSTRING_PTR(salt);
10092 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10093 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10094 }
10095
10096#ifdef BROKEN_CRYPT
10097 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10098 salt_8bit_clean[0] = saltp[0] & 0x7f;
10099 salt_8bit_clean[1] = saltp[1] & 0x7f;
10100 salt_8bit_clean[2] = '\0';
10101 saltp = salt_8bit_clean;
10102 }
10103#endif
10104#ifdef HAVE_CRYPT_R
10105 data = ALLOCV(databuf, sizeof(struct crypt_data));
10106# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10107 data->initialized = 0;
10108# endif
10109 res = crypt_r(s, saltp, data);
10110#else
10111 crypt_mutex_initialize();
10112 rb_nativethread_lock_lock(&crypt_mutex.lock);
10113 res = crypt(s, saltp);
10114#endif
10115 if (!res) {
10116 int err = errno;
10117 CRYPT_END();
10118 rb_syserr_fail(err, "crypt");
10119 }
10120 result = rb_str_new_cstr(res);
10121 CRYPT_END();
10122 return result;
10123}
10124
10125
10126/*
10127 * call-seq:
10128 * str.ord -> integer
10129 *
10130 * Returns the Integer ordinal of a one-character string.
10131 *
10132 * "a".ord #=> 97
10133 */
10134
10135static VALUE
10136rb_str_ord(VALUE s)
10137{
10138 unsigned int c;
10139
10140 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10141 return UINT2NUM(c);
10142}
10143/*
10144 * call-seq:
10145 * str.sum(n=16) -> integer
10146 *
10147 * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10148 * where <em>n</em> is the optional Integer parameter, defaulting
10149 * to 16. The result is simply the sum of the binary value of each byte in
10150 * <i>str</i> modulo <code>2**n</code> (that is, masked with <code>2**n - 1</code>). This is not a particularly good
10151 * checksum.
10152 */
10153
10154static VALUE
10155rb_str_sum(int argc, VALUE *argv, VALUE str)
10156{
10157 int bits = 16;
10158 char *ptr, *p, *pend;
10159 long len;
10160 VALUE sum = INT2FIX(0);
10161 unsigned long sum0 = 0;
10162
10163 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10164 bits = 0;
10165 }
10166 ptr = p = RSTRING_PTR(str);
10167 len = RSTRING_LEN(str);
10168 pend = p + len;
10169
10170 while (p < pend) {
10171 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10172 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10173 str_mod_check(str, ptr, len);
10174 sum0 = 0;
10175 }
10176 sum0 += (unsigned char)*p;
10177 p++;
10178 }
10179
10180 if (bits == 0) {
10181 if (sum0) {
10182 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10183 }
10184 }
10185 else {
10186 if (sum == INT2FIX(0)) {
10187 if (bits < (int)sizeof(long)*CHAR_BIT) {
10188 sum0 &= (((unsigned long)1)<<bits)-1;
10189 }
10190 sum = LONG2FIX(sum0);
10191 }
10192 else {
10193 VALUE mod;
10194
10195 if (sum0) {
10196 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10197 }
10198
10199 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10200 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10201 sum = rb_funcall(sum, '&', 1, mod);
10202 }
10203 }
10204 return sum;
10205}
10206
10207static VALUE
10208rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10209{
10210 rb_encoding *enc;
10211 VALUE w;
10212 long width, len, flen = 1, fclen = 1;
10213 VALUE res;
10214 char *p;
10215 const char *f = " ";
10216 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10217 VALUE pad;
10218 int singlebyte = 1, cr;
10219 int termlen;
10220
10221 rb_scan_args(argc, argv, "11", &w, &pad);
10222 enc = STR_ENC_GET(str);
10223 termlen = rb_enc_mbminlen(enc);
10224 width = NUM2LONG(w);
10225 if (argc == 2) {
10226 StringValue(pad);
10227 enc = rb_enc_check(str, pad);
10228 f = RSTRING_PTR(pad);
10229 flen = RSTRING_LEN(pad);
10230 fclen = str_strlen(pad, enc); /* rb_enc_check */
10231 singlebyte = single_byte_optimizable(pad);
10232 if (flen == 0 || fclen == 0) {
10233 rb_raise(rb_eArgError, "zero width padding");
10234 }
10235 }
10236 len = str_strlen(str, enc); /* rb_enc_check */
10237 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10238 n = width - len;
10239 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10240 rlen = n - llen;
10241 cr = ENC_CODERANGE(str);
10242 if (flen > 1) {
10243 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10244 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10245 }
10246 size = RSTRING_LEN(str);
10247 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10248 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10249 (len += llen2 + rlen2) >= LONG_MAX - size) {
10250 rb_raise(rb_eArgError, "argument too big");
10251 }
10252 len += size;
10253 res = str_new0(rb_cString, 0, len, termlen);
10254 p = RSTRING_PTR(res);
10255 if (flen <= 1) {
10256 memset(p, *f, llen);
10257 p += llen;
10258 }
10259 else {
10260 while (llen >= fclen) {
10261 memcpy(p,f,flen);
10262 p += flen;
10263 llen -= fclen;
10264 }
10265 if (llen > 0) {
10266 memcpy(p, f, llen2);
10267 p += llen2;
10268 }
10269 }
10270 memcpy(p, RSTRING_PTR(str), size);
10271 p += size;
10272 if (flen <= 1) {
10273 memset(p, *f, rlen);
10274 p += rlen;
10275 }
10276 else {
10277 while (rlen >= fclen) {
10278 memcpy(p,f,flen);
10279 p += flen;
10280 rlen -= fclen;
10281 }
10282 if (rlen > 0) {
10283 memcpy(p, f, rlen2);
10284 p += rlen2;
10285 }
10286 }
10287 TERM_FILL(p, termlen);
10288 STR_SET_LEN(res, p-RSTRING_PTR(res));
10289 rb_enc_associate(res, enc);
10290 if (argc == 2)
10291 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10292 if (cr != ENC_CODERANGE_BROKEN)
10293 ENC_CODERANGE_SET(res, cr);
10294
10295 RB_GC_GUARD(pad);
10296 return res;
10297}
10298
10299
10300/*
10301 * call-seq:
10302 * str.ljust(integer, padstr=' ') -> new_str
10303 *
10304 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10305 * String of length <i>integer</i> with <i>str</i> left justified
10306 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10307 *
10308 * "hello".ljust(4) #=> "hello"
10309 * "hello".ljust(20) #=> "hello "
10310 * "hello".ljust(20, '1234') #=> "hello123412341234123"
10311 */
10312
10313static VALUE
10314rb_str_ljust(int argc, VALUE *argv, VALUE str)
10315{
10316 return rb_str_justify(argc, argv, str, 'l');
10317}
10318
10319
10320/*
10321 * call-seq:
10322 * str.rjust(integer, padstr=' ') -> new_str
10323 *
10324 * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10325 * String of length <i>integer</i> with <i>str</i> right justified
10326 * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10327 *
10328 * "hello".rjust(4) #=> "hello"
10329 * "hello".rjust(20) #=> " hello"
10330 * "hello".rjust(20, '1234') #=> "123412341234123hello"
10331 */
10332
10333static VALUE
10334rb_str_rjust(int argc, VALUE *argv, VALUE str)
10335{
10336 return rb_str_justify(argc, argv, str, 'r');
10337}
10338
10339
10340/*
10341 * call-seq:
10342 * str.center(width, padstr=' ') -> new_str
10343 *
10344 * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10345 * returns a new String of length +width+ with +str+ centered and padded with
10346 * +padstr+; otherwise, returns +str+.
10347 *
10348 * "hello".center(4) #=> "hello"
10349 * "hello".center(20) #=> " hello "
10350 * "hello".center(20, '123') #=> "1231231hello12312312"
10351 */
10352
10353static VALUE
10354rb_str_center(int argc, VALUE *argv, VALUE str)
10355{
10356 return rb_str_justify(argc, argv, str, 'c');
10357}
10358
10359/*
10360 * call-seq:
10361 * str.partition(sep) -> [head, sep, tail]
10362 * str.partition(regexp) -> [head, match, tail]
10363 *
10364 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10365 * and returns the part before it, the match, and the part
10366 * after it.
10367 * If it is not found, returns <i>str</i> followed by two empty strings.
10368 *
10369 * "hello".partition("l") #=> ["he", "l", "lo"]
10370 * "hello".partition("x") #=> ["hello", "", ""]
10371 * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10372 */
10373
10374static VALUE
10375rb_str_partition(VALUE str, VALUE sep)
10376{
10377 long pos;
10378
10379 sep = get_pat_quoted(sep, 0);
10380 if (RB_TYPE_P(sep, T_REGEXP)) {
10381 if (rb_reg_search(sep, str, 0, 0) < 0) {
10382 goto failed;
10383 }
10384 VALUE match = rb_backref_get();
10385 struct re_registers *regs = RMATCH_REGS(match);
10386
10387 pos = BEG(0);
10388 sep = rb_str_subseq(str, pos, END(0) - pos);
10389 }
10390 else {
10391 pos = rb_str_index(str, sep, 0);
10392 if (pos < 0) goto failed;
10393 }
10394 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10395 sep,
10396 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10397 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10398
10399 failed:
10400 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10401}
10402
10403/*
10404 * call-seq:
10405 * str.rpartition(sep) -> [head, sep, tail]
10406 * str.rpartition(regexp) -> [head, match, tail]
10407 *
10408 * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10409 * of the string, and returns the part before it, the match, and the part
10410 * after it.
10411 * If it is not found, returns two empty strings and <i>str</i>.
10412 *
10413 * "hello".rpartition("l") #=> ["hel", "l", "o"]
10414 * "hello".rpartition("x") #=> ["", "", "hello"]
10415 * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10416 *
10417 * The match from the end means starting at the possible last position, not
10418 * the last of longest matches.
10419 *
10420 * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10421 *
10422 * To partition at the last longest match, needs to combine with
10423 * negative lookbehind.
10424 *
10425 * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10426 *
10427 * Or String#partition with negative lookforward.
10428 *
10429 * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10430 */
10431
10432static VALUE
10433rb_str_rpartition(VALUE str, VALUE sep)
10434{
10435 long pos = RSTRING_LEN(str);
10436
10437 sep = get_pat_quoted(sep, 0);
10438 if (RB_TYPE_P(sep, T_REGEXP)) {
10439 if (rb_reg_search(sep, str, pos, 1) < 0) {
10440 goto failed;
10441 }
10442 VALUE match = rb_backref_get();
10443 struct re_registers *regs = RMATCH_REGS(match);
10444
10445 pos = BEG(0);
10446 sep = rb_str_subseq(str, pos, END(0) - pos);
10447 }
10448 else {
10449 pos = rb_str_sublen(str, pos);
10450 pos = rb_str_rindex(str, sep, pos);
10451 if (pos < 0) {
10452 goto failed;
10453 }
10454 pos = rb_str_offset(str, pos);
10455 }
10456
10457 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10458 sep,
10459 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10460 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10461 failed:
10462 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10463}
10464
10465/*
10466 * call-seq:
10467 * str.start_with?([prefixes]+) -> true or false
10468 *
10469 * Returns true if +str+ starts with one of the +prefixes+ given.
10470 * Each of the +prefixes+ should be a String or a Regexp.
10471 *
10472 * "hello".start_with?("hell") #=> true
10473 * "hello".start_with?(/H/i) #=> true
10474 *
10475 * # returns true if one of the prefixes matches.
10476 * "hello".start_with?("heaven", "hell") #=> true
10477 * "hello".start_with?("heaven", "paradise") #=> false
10478 */
10479
10480static VALUE
10481rb_str_start_with(int argc, VALUE *argv, VALUE str)
10482{
10483 int i;
10484
10485 for (i=0; i<argc; i++) {
10486 VALUE tmp = argv[i];
10487 if (RB_TYPE_P(tmp, T_REGEXP)) {
10488 if (rb_reg_start_with_p(tmp, str))
10489 return Qtrue;
10490 }
10491 else {
10492 StringValue(tmp);
10493 rb_enc_check(str, tmp);
10494 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10495 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10496 return Qtrue;
10497 }
10498 }
10499 return Qfalse;
10500}
10501
10502/*
10503 * call-seq:
10504 * str.end_with?([suffixes]+) -> true or false
10505 *
10506 * Returns true if +str+ ends with one of the +suffixes+ given.
10507 *
10508 * "hello".end_with?("ello") #=> true
10509 *
10510 * # returns true if one of the +suffixes+ matches.
10511 * "hello".end_with?("heaven", "ello") #=> true
10512 * "hello".end_with?("heaven", "paradise") #=> false
10513 */
10514
10515static VALUE
10516rb_str_end_with(int argc, VALUE *argv, VALUE str)
10517{
10518 int i;
10519 char *p, *s, *e;
10520 rb_encoding *enc;
10521
10522 for (i=0; i<argc; i++) {
10523 VALUE tmp = argv[i];
10524 long slen, tlen;
10525 StringValue(tmp);
10526 enc = rb_enc_check(str, tmp);
10527 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10528 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10529 p = RSTRING_PTR(str);
10530 e = p + slen;
10531 s = e - tlen;
10532 if (rb_enc_left_char_head(p, s, e, enc) != s)
10533 continue;
10534 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10535 return Qtrue;
10536 }
10537 return Qfalse;
10538}
10539
10549static long
10550deleted_prefix_length(VALUE str, VALUE prefix)
10551{
10552 char *strptr, *prefixptr;
10553 long olen, prefixlen;
10554
10555 StringValue(prefix);
10556 if (is_broken_string(prefix)) return 0;
10557 rb_enc_check(str, prefix);
10558
10559 /* return 0 if not start with prefix */
10560 prefixlen = RSTRING_LEN(prefix);
10561 if (prefixlen <= 0) return 0;
10562 olen = RSTRING_LEN(str);
10563 if (olen < prefixlen) return 0;
10564 strptr = RSTRING_PTR(str);
10565 prefixptr = RSTRING_PTR(prefix);
10566 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10567
10568 return prefixlen;
10569}
10570
10571/*
10572 * call-seq:
10573 * str.delete_prefix!(prefix) -> self or nil
10574 *
10575 * Deletes leading <code>prefix</code> from <i>str</i>, returning
10576 * <code>nil</code> if no change was made.
10577 *
10578 * "hello".delete_prefix!("hel") #=> "lo"
10579 * "hello".delete_prefix!("llo") #=> nil
10580 */
10581
10582static VALUE
10583rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10584{
10585 long prefixlen;
10586 str_modify_keep_cr(str);
10587
10588 prefixlen = deleted_prefix_length(str, prefix);
10589 if (prefixlen <= 0) return Qnil;
10590
10591 return rb_str_drop_bytes(str, prefixlen);
10592}
10593
10594/*
10595 * call-seq:
10596 * str.delete_prefix(prefix) -> new_str
10597 *
10598 * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10599 *
10600 * "hello".delete_prefix("hel") #=> "lo"
10601 * "hello".delete_prefix("llo") #=> "hello"
10602 */
10603
10604static VALUE
10605rb_str_delete_prefix(VALUE str, VALUE prefix)
10606{
10607 long prefixlen;
10608
10609 prefixlen = deleted_prefix_length(str, prefix);
10610 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10611
10612 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10613}
10614
10624static long
10625deleted_suffix_length(VALUE str, VALUE suffix)
10626{
10627 char *strptr, *suffixptr, *s;
10628 long olen, suffixlen;
10629 rb_encoding *enc;
10630
10631 StringValue(suffix);
10632 if (is_broken_string(suffix)) return 0;
10633 enc = rb_enc_check(str, suffix);
10634
10635 /* return 0 if not start with suffix */
10636 suffixlen = RSTRING_LEN(suffix);
10637 if (suffixlen <= 0) return 0;
10638 olen = RSTRING_LEN(str);
10639 if (olen < suffixlen) return 0;
10640 strptr = RSTRING_PTR(str);
10641 suffixptr = RSTRING_PTR(suffix);
10642 s = strptr + olen - suffixlen;
10643 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10644 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10645
10646 return suffixlen;
10647}
10648
10649/*
10650 * call-seq:
10651 * str.delete_suffix!(suffix) -> self or nil
10652 *
10653 * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10654 * <code>nil</code> if no change was made.
10655 *
10656 * "hello".delete_suffix!("llo") #=> "he"
10657 * "hello".delete_suffix!("hel") #=> nil
10658 */
10659
10660static VALUE
10661rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10662{
10663 long olen, suffixlen, len;
10664 str_modifiable(str);
10665
10666 suffixlen = deleted_suffix_length(str, suffix);
10667 if (suffixlen <= 0) return Qnil;
10668
10669 olen = RSTRING_LEN(str);
10670 str_modify_keep_cr(str);
10671 len = olen - suffixlen;
10672 STR_SET_LEN(str, len);
10673 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10674 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10676 }
10677 return str;
10678}
10679
10680/*
10681 * call-seq:
10682 * str.delete_suffix(suffix) -> new_str
10683 *
10684 * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10685 *
10686 * "hello".delete_suffix("llo") #=> "he"
10687 * "hello".delete_suffix("hel") #=> "hello"
10688 */
10689
10690static VALUE
10691rb_str_delete_suffix(VALUE str, VALUE suffix)
10692{
10693 long suffixlen;
10694
10695 suffixlen = deleted_suffix_length(str, suffix);
10696 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10697
10698 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10699}
10700
10701void
10702rb_str_setter(VALUE val, ID id, VALUE *var)
10703{
10704 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10705 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10706 }
10707 *var = val;
10708}
10709
10710static void
10711rb_fs_setter(VALUE val, ID id, VALUE *var)
10712{
10713 val = rb_fs_check(val);
10714 if (!val) {
10716 "value of %"PRIsVALUE" must be String or Regexp",
10717 rb_id2str(id));
10718 }
10719 if (!NIL_P(val)) {
10720 rb_warn_deprecated("`$;'", NULL);
10721 }
10722 *var = val;
10723}
10724
10725
10726/*
10727 * call-seq:
10728 * str.force_encoding(encoding) -> str
10729 *
10730 * Changes the encoding to +encoding+ and returns self.
10731 */
10732
10733static VALUE
10734rb_str_force_encoding(VALUE str, VALUE enc)
10735{
10736 str_modifiable(str);
10737 rb_enc_associate(str, rb_to_encoding(enc));
10739 return str;
10740}
10741
10742/*
10743 * call-seq:
10744 * str.b -> str
10745 *
10746 * Returns a copied string whose encoding is ASCII-8BIT.
10747 */
10748
10749static VALUE
10750rb_str_b(VALUE str)
10751{
10752 VALUE str2;
10753 if (FL_TEST(str, STR_NOEMBED)) {
10754 str2 = str_alloc_heap(rb_cString);
10755 }
10756 else {
10757 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10758 }
10759 str_replace_shared_without_enc(str2, str);
10760 ENC_CODERANGE_CLEAR(str2);
10761 return str2;
10762}
10763
10764/*
10765 * call-seq:
10766 * str.valid_encoding? -> true or false
10767 *
10768 * Returns true for a string which is encoded correctly.
10769 *
10770 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10771 * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10772 * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10773 */
10774
10775static VALUE
10776rb_str_valid_encoding_p(VALUE str)
10777{
10778 int cr = rb_enc_str_coderange(str);
10779
10780 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10781}
10782
10783/*
10784 * call-seq:
10785 * str.ascii_only? -> true or false
10786 *
10787 * Returns true for a string which has only ASCII characters.
10788 *
10789 * "abc".force_encoding("UTF-8").ascii_only? #=> true
10790 * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10791 */
10792
10793static VALUE
10794rb_str_is_ascii_only_p(VALUE str)
10795{
10796 int cr = rb_enc_str_coderange(str);
10797
10798 return RBOOL(cr == ENC_CODERANGE_7BIT);
10799}
10800
10801VALUE
10803{
10804 static const char ellipsis[] = "...";
10805 const long ellipsislen = sizeof(ellipsis) - 1;
10806 rb_encoding *const enc = rb_enc_get(str);
10807 const long blen = RSTRING_LEN(str);
10808 const char *const p = RSTRING_PTR(str), *e = p + blen;
10809 VALUE estr, ret = 0;
10810
10811 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10812 if (len * rb_enc_mbminlen(enc) >= blen ||
10813 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10814 ret = str;
10815 }
10816 else if (len <= ellipsislen ||
10817 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10818 if (rb_enc_asciicompat(enc)) {
10819 ret = rb_str_new(ellipsis, len);
10820 rb_enc_associate(ret, enc);
10821 }
10822 else {
10823 estr = rb_usascii_str_new(ellipsis, len);
10824 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10825 }
10826 }
10827 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10828 rb_str_cat(ret, ellipsis, ellipsislen);
10829 }
10830 else {
10831 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10832 rb_enc_from_encoding(enc), 0, Qnil);
10833 rb_str_append(ret, estr);
10834 }
10835 return ret;
10836}
10837
10838static VALUE
10839str_compat_and_valid(VALUE str, rb_encoding *enc)
10840{
10841 int cr;
10842 str = StringValue(str);
10843 cr = rb_enc_str_coderange(str);
10844 if (cr == ENC_CODERANGE_BROKEN) {
10845 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10846 }
10847 else {
10848 rb_encoding *e = STR_ENC_GET(str);
10849 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10850 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10851 rb_enc_name(enc), rb_enc_name(e));
10852 }
10853 }
10854 return str;
10855}
10856
10857static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10858
10859VALUE
10861{
10862 rb_encoding *enc = STR_ENC_GET(str);
10863 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10864}
10865
10866VALUE
10867rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10868{
10869 int cr = ENC_CODERANGE_UNKNOWN;
10870 if (enc == STR_ENC_GET(str)) {
10871 /* cached coderange makes sense only when enc equals the
10872 * actual encoding of str */
10873 cr = ENC_CODERANGE(str);
10874 }
10875 return enc_str_scrub(enc, str, repl, cr);
10876}
10877
10878static VALUE
10879enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10880{
10881 int encidx;
10882 VALUE buf = Qnil;
10883 const char *rep, *p, *e, *p1, *sp;
10884 long replen = -1;
10885 long slen;
10886
10887 if (rb_block_given_p()) {
10888 if (!NIL_P(repl))
10889 rb_raise(rb_eArgError, "both of block and replacement given");
10890 replen = 0;
10891 }
10892
10893 if (ENC_CODERANGE_CLEAN_P(cr))
10894 return Qnil;
10895
10896 if (!NIL_P(repl)) {
10897 repl = str_compat_and_valid(repl, enc);
10898 }
10899
10900 if (rb_enc_dummy_p(enc)) {
10901 return Qnil;
10902 }
10903 encidx = rb_enc_to_index(enc);
10904
10905#define DEFAULT_REPLACE_CHAR(str) do { \
10906 static const char replace[sizeof(str)-1] = str; \
10907 rep = replace; replen = (int)sizeof(replace); \
10908 } while (0)
10909
10910 slen = RSTRING_LEN(str);
10911 p = RSTRING_PTR(str);
10912 e = RSTRING_END(str);
10913 p1 = p;
10914 sp = p;
10915
10916 if (rb_enc_asciicompat(enc)) {
10917 int rep7bit_p;
10918 if (!replen) {
10919 rep = NULL;
10920 rep7bit_p = FALSE;
10921 }
10922 else if (!NIL_P(repl)) {
10923 rep = RSTRING_PTR(repl);
10924 replen = RSTRING_LEN(repl);
10925 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10926 }
10927 else if (encidx == rb_utf8_encindex()) {
10928 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10929 rep7bit_p = FALSE;
10930 }
10931 else {
10932 DEFAULT_REPLACE_CHAR("?");
10933 rep7bit_p = TRUE;
10934 }
10935 cr = ENC_CODERANGE_7BIT;
10936
10937 p = search_nonascii(p, e);
10938 if (!p) {
10939 p = e;
10940 }
10941 while (p < e) {
10942 int ret = rb_enc_precise_mbclen(p, e, enc);
10943 if (MBCLEN_NEEDMORE_P(ret)) {
10944 break;
10945 }
10946 else if (MBCLEN_CHARFOUND_P(ret)) {
10948 p += MBCLEN_CHARFOUND_LEN(ret);
10949 }
10950 else if (MBCLEN_INVALID_P(ret)) {
10951 /*
10952 * p1~p: valid ascii/multibyte chars
10953 * p ~e: invalid bytes + unknown bytes
10954 */
10955 long clen = rb_enc_mbmaxlen(enc);
10956 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10957 if (p > p1) {
10958 rb_str_buf_cat(buf, p1, p - p1);
10959 }
10960
10961 if (e - p < clen) clen = e - p;
10962 if (clen <= 2) {
10963 clen = 1;
10964 }
10965 else {
10966 const char *q = p;
10967 clen--;
10968 for (; clen > 1; clen--) {
10969 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10970 if (MBCLEN_NEEDMORE_P(ret)) break;
10971 if (MBCLEN_INVALID_P(ret)) continue;
10973 }
10974 }
10975 if (rep) {
10976 rb_str_buf_cat(buf, rep, replen);
10977 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10978 }
10979 else {
10980 repl = rb_yield(rb_enc_str_new(p, clen, enc));
10981 str_mod_check(str, sp, slen);
10982 repl = str_compat_and_valid(repl, enc);
10983 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10986 }
10987 p += clen;
10988 p1 = p;
10989 p = search_nonascii(p, e);
10990 if (!p) {
10991 p = e;
10992 break;
10993 }
10994 }
10995 else {
10997 }
10998 }
10999 if (NIL_P(buf)) {
11000 if (p == e) {
11001 ENC_CODERANGE_SET(str, cr);
11002 return Qnil;
11003 }
11004 buf = rb_str_buf_new(RSTRING_LEN(str));
11005 }
11006 if (p1 < p) {
11007 rb_str_buf_cat(buf, p1, p - p1);
11008 }
11009 if (p < e) {
11010 if (rep) {
11011 rb_str_buf_cat(buf, rep, replen);
11012 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11013 }
11014 else {
11015 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11016 str_mod_check(str, sp, slen);
11017 repl = str_compat_and_valid(repl, enc);
11018 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11021 }
11022 }
11023 }
11024 else {
11025 /* ASCII incompatible */
11026 long mbminlen = rb_enc_mbminlen(enc);
11027 if (!replen) {
11028 rep = NULL;
11029 }
11030 else if (!NIL_P(repl)) {
11031 rep = RSTRING_PTR(repl);
11032 replen = RSTRING_LEN(repl);
11033 }
11034 else if (encidx == ENCINDEX_UTF_16BE) {
11035 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11036 }
11037 else if (encidx == ENCINDEX_UTF_16LE) {
11038 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11039 }
11040 else if (encidx == ENCINDEX_UTF_32BE) {
11041 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11042 }
11043 else if (encidx == ENCINDEX_UTF_32LE) {
11044 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11045 }
11046 else {
11047 DEFAULT_REPLACE_CHAR("?");
11048 }
11049
11050 while (p < e) {
11051 int ret = rb_enc_precise_mbclen(p, e, enc);
11052 if (MBCLEN_NEEDMORE_P(ret)) {
11053 break;
11054 }
11055 else if (MBCLEN_CHARFOUND_P(ret)) {
11056 p += MBCLEN_CHARFOUND_LEN(ret);
11057 }
11058 else if (MBCLEN_INVALID_P(ret)) {
11059 const char *q = p;
11060 long clen = rb_enc_mbmaxlen(enc);
11061 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11062 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11063
11064 if (e - p < clen) clen = e - p;
11065 if (clen <= mbminlen * 2) {
11066 clen = mbminlen;
11067 }
11068 else {
11069 clen -= mbminlen;
11070 for (; clen > mbminlen; clen-=mbminlen) {
11071 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11072 if (MBCLEN_NEEDMORE_P(ret)) break;
11073 if (MBCLEN_INVALID_P(ret)) continue;
11075 }
11076 }
11077 if (rep) {
11078 rb_str_buf_cat(buf, rep, replen);
11079 }
11080 else {
11081 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11082 str_mod_check(str, sp, slen);
11083 repl = str_compat_and_valid(repl, enc);
11084 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11085 }
11086 p += clen;
11087 p1 = p;
11088 }
11089 else {
11091 }
11092 }
11093 if (NIL_P(buf)) {
11094 if (p == e) {
11096 return Qnil;
11097 }
11098 buf = rb_str_buf_new(RSTRING_LEN(str));
11099 }
11100 if (p1 < p) {
11101 rb_str_buf_cat(buf, p1, p - p1);
11102 }
11103 if (p < e) {
11104 if (rep) {
11105 rb_str_buf_cat(buf, rep, replen);
11106 }
11107 else {
11108 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11109 str_mod_check(str, sp, slen);
11110 repl = str_compat_and_valid(repl, enc);
11111 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11112 }
11113 }
11115 }
11116 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11117 return buf;
11118}
11119
11120/*
11121 * call-seq:
11122 * str.scrub -> new_str
11123 * str.scrub(repl) -> new_str
11124 * str.scrub{|bytes|} -> new_str
11125 *
11126 * If the string contains invalid byte sequences, replaces the invalid bytes with the given
11127 * replacement string; otherwise returns a copy of the string.
11128 * If a block is given, replaces the invalid bytes with the value returned by the block.
11129 *
11130 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11131 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11132 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11133 */
11134static VALUE
11135str_scrub(int argc, VALUE *argv, VALUE str)
11136{
11137 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11138 VALUE new = rb_str_scrub(str, repl);
11139 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11140}
11141
11142/*
11143 * call-seq:
11144 * str.scrub! -> str
11145 * str.scrub!(repl) -> str
11146 * str.scrub!{|bytes|} -> str
11147 *
11148 * If the string contains invalid byte sequences, replaces the invalid bytes in place with the
11149 * given replacement string. Always returns self.
11150 * If a block is given, replaces the invalid bytes with the value returned by the block.
11151 *
11152 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11153 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11154 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11155 */
11156static VALUE
11157str_scrub_bang(int argc, VALUE *argv, VALUE str)
11158{
11159 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11160 VALUE new = rb_str_scrub(str, repl);
11161 if (!NIL_P(new)) rb_str_replace(str, new);
11162 return str;
11163}
11164
11165static ID id_normalize;
11166static ID id_normalized_p;
11167static VALUE mUnicodeNormalize;
11168
11169static VALUE
11170unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11171{
11172 static int UnicodeNormalizeRequired = 0;
11173 VALUE argv2[2];
11174
11175 if (!UnicodeNormalizeRequired) {
11176 rb_require("unicode_normalize/normalize.rb");
11177 UnicodeNormalizeRequired = 1;
11178 }
11179 argv2[0] = str;
11180 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11181 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11182}
11183
11184/*
11185 * call-seq:
11186 * str.unicode_normalize(form=:nfc)
11187 *
11188 * Unicode Normalization---Returns a normalized form of +str+,
11189 * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11190 * The normalization form used is determined by +form+, which can
11191 * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11192 * The default is +:nfc+.
11193 *
11194 * If the string is not in a Unicode Encoding, then an Exception is raised.
11195 * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11196 * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11197 * Anything other than UTF-8 is implemented by converting to UTF-8,
11198 * which makes it slower than UTF-8.
11199 *
11200 * "a\u0300".unicode_normalize #=> "\u00E0"
11201 * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11202 * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11203 * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11204 * #=> Encoding::CompatibilityError raised
11205 */
11206static VALUE
11207rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11208{
11209 return unicode_normalize_common(argc, argv, str, id_normalize);
11210}
11211
11212/*
11213 * call-seq:
11214 * str.unicode_normalize!(form=:nfc)
11215 *
11216 * Destructive version of String#unicode_normalize, doing Unicode
11217 * normalization in place.
11218 */
11219static VALUE
11220rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11221{
11222 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11223}
11224
11225/* call-seq:
11226 * str.unicode_normalized?(form=:nfc)
11227 *
11228 * Checks whether +str+ is in Unicode normalization form +form+,
11229 * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11230 * The default is +:nfc+.
11231 *
11232 * If the string is not in a Unicode Encoding, then an Exception is raised.
11233 * For details, see String#unicode_normalize.
11234 *
11235 * "a\u0300".unicode_normalized? #=> false
11236 * "a\u0300".unicode_normalized?(:nfd) #=> true
11237 * "\u00E0".unicode_normalized? #=> true
11238 * "\u00E0".unicode_normalized?(:nfd) #=> false
11239 * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11240 * #=> Encoding::CompatibilityError raised
11241 */
11242static VALUE
11243rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11244{
11245 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11246}
11247
11248/**********************************************************************
11249 * Document-class: Symbol
11250 *
11251 * Symbol objects represent named identifiers inside the Ruby interpreter.
11252 *
11253 * You can create a \Symbol object explicitly with:
11254 *
11255 * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11256 *
11257 * The same Symbol object will be
11258 * created for a given name or string for the duration of a program's
11259 * execution, regardless of the context or meaning of that name. Thus
11260 * if <code>Fred</code> is a constant in one context, a method in
11261 * another, and a class in a third, the Symbol <code>:Fred</code>
11262 * will be the same object in all three contexts.
11263 *
11264 * module One
11265 * class Fred
11266 * end
11267 * $f1 = :Fred
11268 * end
11269 * module Two
11270 * Fred = 1
11271 * $f2 = :Fred
11272 * end
11273 * def Fred()
11274 * end
11275 * $f3 = :Fred
11276 * $f1.object_id #=> 2514190
11277 * $f2.object_id #=> 2514190
11278 * $f3.object_id #=> 2514190
11279 *
11280 * Constant, method, and variable names are returned as symbols:
11281 *
11282 * module One
11283 * Two = 2
11284 * def three; 3 end
11285 * @four = 4
11286 * @@five = 5
11287 * $six = 6
11288 * end
11289 * seven = 7
11290 *
11291 * One.constants
11292 * # => [:Two]
11293 * One.instance_methods(true)
11294 * # => [:three]
11295 * One.instance_variables
11296 * # => [:@four]
11297 * One.class_variables
11298 * # => [:@@five]
11299 * global_variables.grep(/six/)
11300 * # => [:$six]
11301 * local_variables
11302 * # => [:seven]
11303 *
11304 * Symbol objects are different from String objects in that
11305 * Symbol objects represent identifiers, while String objects
11306 * represent text or data.
11307 *
11308 * == What's Here
11309 *
11310 * First, what's elsewhere. \Class \Symbol:
11311 *
11312 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11313 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11314 *
11315 * Here, class \Symbol provides methods that are useful for:
11316 *
11317 * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11318 * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11319 * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11320 *
11321 * === Methods for Querying
11322 *
11323 * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11324 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11325 * in symbol that matches a given Regexp
11326 * or other object; returns +nil+ if no match is found.
11327 * - #[], #slice :: Returns a substring of symbol
11328 * determined by a given index, start/length, or range, or string.
11329 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11330 * - #encoding:: Returns the Encoding object that represents the encoding
11331 * of symbol.
11332 * - #end_with?:: Returns +true+ if symbol ends with
11333 * any of the given strings.
11334 * - #match:: Returns a MatchData object if symbol
11335 * matches a given Regexp; +nil+ otherwise.
11336 * - #match?:: Returns +true+ if symbol
11337 * matches a given Regexp; +false+ otherwise.
11338 * - #length, #size:: Returns the number of characters in symbol.
11339 * - #start_with?:: Returns +true+ if symbol starts with
11340 * any of the given strings.
11341 *
11342 * === Methods for Comparing
11343 *
11344 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11345 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11346 * has the same content and encoding.
11347 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11348 * symbol is smaller than, equal to, or larger than symbol.
11349 * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11350 * after Unicode case folding; +false+ otherwise.
11351 *
11352 * === Methods for Converting
11353 *
11354 * - #capitalize:: Returns symbol with the first character upcased
11355 * and all other characters downcased.
11356 * - #downcase:: Returns symbol with all characters downcased.
11357 * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11358 * - #name:: Returns the frozen string corresponding to symbol.
11359 * - #succ, #next:: Returns the symbol that is the successor to symbol.
11360 * - #swapcase:: Returns symbol with all upcase characters downcased
11361 * and all downcase characters upcased.
11362 * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11363 * - #to_s, #id2name:: Returns the string corresponding to +self+.
11364 * - #to_sym, #intern:: Returns +self+.
11365 * - #upcase:: Returns symbol with all characters upcased.
11366 *
11367 */
11368
11369
11370/*
11371 * call-seq:
11372 * sym == obj -> true or false
11373 *
11374 * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11375 * symbol, returns <code>true</code>.
11376 */
11377
/* No dedicated function: sym_equal is simply another name for rb_obj_equal. */
11378#define sym_equal rb_obj_equal
11379
11380static int
11381sym_printable(const char *s, const char *send, rb_encoding *enc)
11382{
11383 while (s < send) {
11384 int n;
11385 int c = rb_enc_precise_mbclen(s, send, enc);
11386
11387 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11388 n = MBCLEN_CHARFOUND_LEN(c);
11389 c = rb_enc_mbc_to_codepoint(s, send, enc);
11390 if (!rb_enc_isprint(c, enc)) return FALSE;
11391 s += n;
11392 }
11393 return TRUE;
11394}
11395
11396int
11397rb_str_symname_p(VALUE sym)
11398{
11399 rb_encoding *enc;
11400 const char *ptr;
11401 long len;
11402 rb_encoding *resenc = rb_default_internal_encoding();
11403
11404 if (resenc == NULL) resenc = rb_default_external_encoding();
11405 enc = STR_ENC_GET(sym);
11406 ptr = RSTRING_PTR(sym);
11407 len = RSTRING_LEN(sym);
11408 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11409 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11410 return FALSE;
11411 }
11412 return TRUE;
11413}
11414
11415VALUE
11416rb_str_quote_unprintable(VALUE str)
11417{
11418 rb_encoding *enc;
11419 const char *ptr;
11420 long len;
11421 rb_encoding *resenc;
11422
11423 Check_Type(str, T_STRING);
11424 resenc = rb_default_internal_encoding();
11425 if (resenc == NULL) resenc = rb_default_external_encoding();
11426 enc = STR_ENC_GET(str);
11427 ptr = RSTRING_PTR(str);
11428 len = RSTRING_LEN(str);
11429 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11430 !sym_printable(ptr, ptr + len, enc)) {
11431 return rb_str_escape(str);
11432 }
11433 return str;
11434}
11435
11436MJIT_FUNC_EXPORTED VALUE
11437rb_id_quote_unprintable(ID id)
11438{
11439 VALUE str = rb_id2str(id);
11440 if (!rb_str_symname_p(str)) {
11441 return rb_str_escape(str);
11442 }
11443 return str;
11444}
11445
11446/*
11447 * call-seq:
11448 * sym.inspect -> string
11449 *
11450 * Returns the representation of <i>sym</i> as a symbol literal.
11451 *
11452 * :fred.inspect #=> ":fred"
11453 */
11454
11455static VALUE
11456sym_inspect(VALUE sym)
11457{
11458 VALUE str = rb_sym2str(sym);
11459 const char *ptr;
11460 long len;
11461 char *dest;
11462
11463 if (!rb_str_symname_p(str)) {
11464 str = rb_str_inspect(str);
11465 len = RSTRING_LEN(str);
11466 rb_str_resize(str, len + 1);
11467 dest = RSTRING_PTR(str);
11468 memmove(dest + 1, dest, len);
11469 }
11470 else {
11471 rb_encoding *enc = STR_ENC_GET(str);
11472 RSTRING_GETMEM(str, ptr, len);
11473 str = rb_enc_str_new(0, len + 1, enc);
11474 dest = RSTRING_PTR(str);
11475 memcpy(dest + 1, ptr, len);
11476 }
11477 dest[0] = ':';
11478 return str;
11479}
11480
11481#if 0 /* for RDoc */
11482/*
11483 * call-seq:
11484 * sym.name -> string
11485 *
11486 * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11487 * returned string is frozen.
11488 *
11489 * :fred.name #=> "fred"
11490 * :fred.name.frozen? #=> true
11491 * :fred.to_s #=> "fred"
11492 * :fred.to_s.frozen? #=> false
11493 */
11494VALUE
11495rb_sym2str(VALUE sym)
11496{
/* Intentionally empty: this stub is compiled out by the enclosing #if 0
 * and only carries the documentation comment above. */
11497
11498}
11499#endif
11500
11501
11502/*
11503 * call-seq:
11504 * sym.id2name -> string
11505 * sym.to_s -> string
11506 *
11507 * Returns the name or string corresponding to <i>sym</i>.
11508 *
11509 * :fred.id2name #=> "fred"
11510 * :ginger.to_s #=> "ginger"
11511 *
11512 * Note that this string is not frozen (unlike the symbol itself).
11513 * To get a frozen string, use #name.
11514 */
11515
11516
11517VALUE
11519{
11520 return str_new_shared(rb_cString, rb_sym2str(sym));
11521}
11522
11523
11524/*
11525 * call-seq:
11526 * sym.to_sym -> sym
11527 * sym.intern -> sym
11528 *
11529 * In general, <code>to_sym</code> returns the Symbol corresponding
11530 * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11531 * in this case.
11532 */
11533
11534static VALUE
11535sym_to_sym(VALUE sym)
11536{
11537 return sym;
11538}
11539
11540MJIT_FUNC_EXPORTED VALUE
11541rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11542{
11543 VALUE obj;
11544
11545 if (argc < 1) {
11546 rb_raise(rb_eArgError, "no receiver given");
11547 }
11548 obj = argv[0];
11549 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11550}
11551
11552#if 0 /* compiled out; NOTE(review): presumably kept for RDoc only, like the rb_sym2str stub -- confirm */
11553/*
11554 * call-seq:
11555 * sym.to_proc
11556 *
11557 * Returns a _Proc_ object which responds to the given method by _sym_.
11558 *
11559 * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11560 */
11561
11562VALUE
11563rb_sym_to_proc(VALUE sym)
11564{
/* Intentionally empty: no implementation is given in this stub. */
11565}
11566#endif
11567
11568/*
11569 * call-seq:
11570 *
11571 * sym.succ
11572 *
11573 * Same as <code>sym.to_s.succ.intern</code>.
11574 */
11575
11576static VALUE
11577sym_succ(VALUE sym)
11578{
11579 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11580}
11581
11582/*
11583 * call-seq:
11584 *
11585 * symbol <=> other_symbol -> -1, 0, +1, or nil
11586 *
11587 * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11588 * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11589 * less than, equal to, or greater than +other_symbol+.
11590 *
11591 * +nil+ is returned if the two values are incomparable.
11592 *
11593 * See String#<=> for more information.
11594 */
11595
11596static VALUE
11597sym_cmp(VALUE sym, VALUE other)
11598{
11599 if (!SYMBOL_P(other)) {
11600 return Qnil;
11601 }
11602 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11603}
11604
11605/*
11606 * call-seq:
11607 * casecmp(other_symbol) -> -1, 0, 1, or nil
11608 *
11609 * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11610 *
11611 * :aBcDeF.casecmp(:abcde) # => 1
11612 * :aBcDeF.casecmp(:abcdef) # => 0
11613 * :aBcDeF.casecmp(:abcdefg) # => -1
11614 * :abcdef.casecmp(:ABCDEF) # => 0
11615 *
11616 * Returns +nil+ if the two symbols have incompatible encodings,
11617 * or if +other_symbol+ is not a symbol:
11618 *
11619 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11620 * other_sym = :"\u{c4 d6 dc}"
11621 * sym.casecmp(other_sym) # => nil
11622 * :foo.casecmp(2) # => nil
11623 *
11624 * Currently, case-insensitivity only works on characters A-Z/a-z,
11625 * not all of Unicode. This is different from Symbol#casecmp?.
11626 *
11627 * Related: Symbol#casecmp?.
11628 *
11629 */
11630
11631static VALUE
11632sym_casecmp(VALUE sym, VALUE other)
11633{
11634 if (!SYMBOL_P(other)) {
11635 return Qnil;
11636 }
11637 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11638}
11639
11640/*
11641 * call-seq:
11642 * casecmp?(other_symbol) -> true, false, or nil
11643 *
11644 * Returns +true+ if +sym+ and +other_symbol+ are equal after
11645 * Unicode case folding, +false+ if they are not equal:
11646 *
11647 * :aBcDeF.casecmp?(:abcde) # => false
11648 * :aBcDeF.casecmp?(:abcdef) # => true
11649 * :aBcDeF.casecmp?(:abcdefg) # => false
11650 * :abcdef.casecmp?(:ABCDEF) # => true
11651 * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11652 *
11653 * Returns +nil+ if the two symbols have incompatible encodings,
11654 * or if +other_symbol+ is not a symbol:
11655 *
11656 * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11657 * other_sym = :"\u{c4 d6 dc}"
11658 * sym.casecmp?(other_sym) # => nil
11659 * :foo.casecmp?(2) # => nil
11660 *
11661 * See {Case Mapping}[doc/case_mapping_rdoc.html].
11662 *
11663 * Related: Symbol#casecmp.
11664 *
11665 */
11666
11667static VALUE
11668sym_casecmp_p(VALUE sym, VALUE other)
11669{
11670 if (!SYMBOL_P(other)) {
11671 return Qnil;
11672 }
11673 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11674}
11675
11676/*
11677 * call-seq:
11678 * sym =~ obj -> integer or nil
11679 *
11680 * Returns <code>sym.to_s =~ obj</code>.
11681 */
11682
11683static VALUE
11684sym_match(VALUE sym, VALUE other)
11685{
11686 return rb_str_match(rb_sym2str(sym), other);
11687}
11688
11689/*
11690 * call-seq:
11691 * sym.match(pattern) -> matchdata or nil
11692 * sym.match(pattern, pos) -> matchdata or nil
11693 *
11694 * Returns <code>sym.to_s.match</code>.
11695 */
11696
11697static VALUE
11698sym_match_m(int argc, VALUE *argv, VALUE sym)
11699{
11700 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11701}
11702
11703/*
11704 * call-seq:
11705 * sym.match?(pattern) -> true or false
11706 * sym.match?(pattern, pos) -> true or false
11707 *
11708 * Returns <code>sym.to_s.match?</code>.
11709 */
11710
11711static VALUE
11712sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11713{
11714 return rb_str_match_m_p(argc, argv, sym);
11715}
11716
11717/*
11718 * call-seq:
11719 * sym[idx] -> char
11720 * sym[b, n] -> string
11721 * sym.slice(idx) -> char
11722 * sym.slice(b, n) -> string
11723 *
11724 * Returns <code>sym.to_s[]</code>.
11725 */
11726
11727static VALUE
11728sym_aref(int argc, VALUE *argv, VALUE sym)
11729{
11730 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11731}
11732
11733/*
11734 * call-seq:
11735 * sym.length -> integer
11736 * sym.size -> integer
11737 *
11738 * Same as <code>sym.to_s.length</code>.
11739 */
11740
11741static VALUE
11742sym_length(VALUE sym)
11743{
11744 return rb_str_length(rb_sym2str(sym));
11745}
11746
11747/*
11748 * call-seq:
11749 * sym.empty? -> true or false
11750 *
11751 * Returns whether _sym_ is :"" or not.
11752 */
11753
11754static VALUE
11755sym_empty(VALUE sym)
11756{
11757 return rb_str_empty(rb_sym2str(sym));
11758}
11759
11760/*
11761 * call-seq:
11762 * upcase(*options) -> symbol
11763 *
11764 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11765 *
11766 * See String#upcase.
11767 *
11768 */
11769
11770static VALUE
11771sym_upcase(int argc, VALUE *argv, VALUE sym)
11772{
11773 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11774}
11775
11776/*
11777 * call-seq:
11778 * downcase(*options) -> symbol
11779 *
11780 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11781 *
11782 * See String#downcase.
11783 *
11784 * Related: Symbol#upcase.
11785 *
11786 */
11787
11788static VALUE
11789sym_downcase(int argc, VALUE *argv, VALUE sym)
11790{
11791 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11792}
11793
11794/*
11795 * call-seq:
11796 * capitalize(*options) -> symbol
11797 *
11798 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11799 *
11800 * See String#capitalize.
11801 *
11802 */
11803
11804static VALUE
11805sym_capitalize(int argc, VALUE *argv, VALUE sym)
11806{
11807 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11808}
11809
11810/*
11811 * call-seq:
11812 * swapcase(*options) -> symbol
11813 *
11814 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11815 *
11816 * See String#swapcase.
11817 *
11818 */
11819
11820static VALUE
11821sym_swapcase(int argc, VALUE *argv, VALUE sym)
11822{
11823 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11824}
11825
11826/*
11827 * call-seq:
11828 * sym.start_with?([prefixes]+) -> true or false
11829 *
11830 * Returns true if +sym+ starts with one of the +prefixes+ given.
11831 * Each of the +prefixes+ should be a String or a Regexp.
11832 *
11833 * :hello.start_with?("hell") #=> true
11834 * :hello.start_with?(/H/i) #=> true
11835 *
11836 * # returns true if one of the prefixes matches.
11837 * :hello.start_with?("heaven", "hell") #=> true
11838 * :hello.start_with?("heaven", "paradise") #=> false
11839 */
11840
11841static VALUE
11842sym_start_with(int argc, VALUE *argv, VALUE sym)
11843{
11844 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11845}
11846
11847/*
11848 * call-seq:
11849 * sym.end_with?([suffixes]+) -> true or false
11850 *
11851 * Returns true if +sym+ ends with one of the +suffixes+ given.
11852 *
11853 * :hello.end_with?("ello") #=> true
11854 *
11855 * # returns true if one of the +suffixes+ matches.
11856 * :hello.end_with?("heaven", "ello") #=> true
11857 * :hello.end_with?("heaven", "paradise") #=> false
11858 */
11859
11860static VALUE
11861sym_end_with(int argc, VALUE *argv, VALUE sym)
11862{
11863 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11864}
11865
11866/*
11867 * call-seq:
11868 * sym.encoding -> encoding
11869 *
11870 * Returns the Encoding object that represents the encoding of _sym_.
11871 */
11872
11873static VALUE
11874sym_encoding(VALUE sym)
11875{
11876 return rb_obj_encoding(rb_sym2str(sym));
11877}
11878
11879static VALUE
11880string_for_symbol(VALUE name)
11881{
11882 if (!RB_TYPE_P(name, T_STRING)) {
11883 VALUE tmp = rb_check_string_type(name);
11884 if (NIL_P(tmp)) {
11885 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11886 name);
11887 }
11888 name = tmp;
11889 }
11890 return name;
11891}
11892
11893ID
11895{
11896 if (SYMBOL_P(name)) {
11897 return SYM2ID(name);
11898 }
11899 name = string_for_symbol(name);
11900 return rb_intern_str(name);
11901}
11902
11903VALUE
11905{
11906 if (SYMBOL_P(name)) {
11907 return name;
11908 }
11909 name = string_for_symbol(name);
11910 return rb_str_intern(name);
11911}
11912
11913/*
11914 * call-seq:
11915 * Symbol.all_symbols => array
11916 *
11917 * Returns an array of all the symbols currently in Ruby's symbol
11918 * table.
11919 *
11920 * Symbol.all_symbols.size #=> 903
11921 * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11922 * :chown, :EOFError, :$;, :String,
11923 * :LOCK_SH, :"setuid?", :$<,
11924 * :default_proc, :compact, :extend,
11925 * :Tms, :getwd, :$=, :ThreadGroup,
11926 * :wait2, :$>]
11927 */
11928
11929static VALUE
11930sym_all_symbols(VALUE _)
11931{
11932 return rb_sym_all_symbols();
11933}
11934
11935VALUE
11937{
11938 return rb_fstring(str);
11939}
11940
11941VALUE
11942rb_interned_str(const char *ptr, long len)
11943{
11944 struct RString fake_str;
11945 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11946}
11947
11948VALUE
11950{
11951 return rb_interned_str(ptr, strlen(ptr));
11952}
11953
11954VALUE
11955rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11956{
11957 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11958 rb_enc_autoload(enc);
11959 }
11960
11961 struct RString fake_str;
11962 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11963}
11964
11965VALUE
11967{
11968 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11969}
11970
11971/*
11972 * A \String object has an arbitrary sequence of bytes,
11973 * typically representing text or binary data.
11974 * A \String object may be created using String::new or as literals.
11975 *
11976 * String objects differ from Symbol objects in that Symbol objects are
11977 * designed to be used as identifiers, instead of text or data.
11978 *
11979 * You can create a \String object explicitly with:
11980 *
11981 * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11982 * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11983 *
11984 * You can convert certain objects to Strings with:
11985 *
11986 * - \Method {String}[Kernel.html#method-i-String].
11987 *
11988 * Some \String methods modify +self+.
11989 * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11990 * and returns +self+;
11991 * often a similarly named method (without the <tt>!</tt>)
11992 * returns a new string.
11993 *
11994 * In general, if both a bang and a non-bang version of a method exist,
11995 * the bang version mutates the receiver and the non-bang version does not.
11996 * However, a method without a bang can also mutate, such as String#replace.
11997 *
11998 * == Substitution Methods
11999 *
12000 * These methods perform substitutions:
12001 *
12002 * - String#sub: One substitution (or none); returns a new string.
12003 * - String#sub!: One substitution (or none); returns +self+.
12004 * - String#gsub: Zero or more substitutions; returns a new string.
12005 * - String#gsub!: Zero or more substitutions; returns +self+.
12006 *
12007 * Each of these methods takes:
12008 *
12009 * - A first argument, +pattern+ (string or regexp),
12010 * that specifies the substring(s) to be replaced.
12011 *
12012 * - Either of these:
12013 *
12014 * - A second argument, +replacement+ (string or hash),
12015 * that determines the replacing string.
12016 * - A block that will determine the replacing string.
12017 *
12018 * The examples in this section mostly use methods String#sub and String#gsub;
12019 * the principles illustrated apply to all four substitution methods.
12020 *
12021 * <b>Argument +pattern+</b>
12022 *
12023 * Argument +pattern+ is commonly a regular expression:
12024 *
12025 * s = 'hello'
12026 * s.sub(/[aeiou]/, '*') # => "h*llo"
12027 * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12028 * s.gsub(/[aeiou]/, '') # => "hll"
12029 * s.sub(/ell/, 'al') # => "halo"
12030 * s.gsub(/xyzzy/, '*') # => "hello"
12031 * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12032 *
12033 * When +pattern+ is a string, all its characters are treated
12034 * as ordinary characters (not as regexp special characters):
12035 *
12036 * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12037 *
12038 * <b>\String +replacement+</b>
12039 *
12040 * If +replacement+ is a string, that string will determine
12041 * the replacing string that is to be substituted for the matched text.
12042 *
12043 * Each of the examples above uses a simple string as the replacing string.
12044 *
12045 * \String +replacement+ may contain back-references to the pattern's captures:
12046 *
12047 * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12048 * - <tt>\k<name></tt> refers to the named capture +name+.
12049 *
12050 * See rdoc-ref:regexp.rdoc for details.
12051 *
12052 * Note that within the string +replacement+, a character combination
12053 * such as <tt>$&</tt> is treated as ordinary text, and not as
12054 * a special match variable.
12055 * However, you may refer to some special match variables using these
12056 * combinations:
12057 *
12058 * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12059 * which contains the complete matched text.
12060 * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12061 * which contains the string after the match.
12062 * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12063 * which contains the string before the match.
12064 * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12065 * which contains the last capture group.
12066 *
12067 * See rdoc-ref:regexp.rdoc for details.
12068 *
12069 * Note that <tt>\\\</tt> is interpreted as an escape, i.e., a single backslash.
12070 *
12071 * Note also that a string literal consumes backslashes.
12072 * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12073 *
12074 * A back-reference is typically preceded by an additional backslash.
12075 * For example, if you want to write a back-reference <tt>\&</tt> in
12076 * +replacement+ with a double-quoted string literal, you need to write
12077 * <tt>"..\\\\&.."</tt>.
12078 *
12079 * If you want to write a non-back-reference string <tt>\&</tt> in
12080 * +replacement+, you need first to escape the backslash to prevent
12081 * this method from interpreting it as a back-reference, and then you
12082 * need to escape the backslashes again to prevent a string literal from
12083 * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12084 *
12085 * You may want to use the block form to avoid a lot of backslashes.
12086 *
12087 * <b>\Hash +replacement+</b>
12088 *
12089 * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12090 * the replacing string is the value for that key:
12091 *
12092 * h = {'foo' => 'bar', 'baz' => 'bat'}
12093 * 'food'.sub('foo', h) # => "bard"
12094 *
12095 * Note that a symbol key does not match:
12096 *
12097 * h = {foo: 'bar', baz: 'bat'}
12098 * 'food'.sub('foo', h) # => "d"
12099 *
12100 * <b>Block</b>
12101 *
12102 * In the block form, the current match string is passed to the block;
12103 * the block's return value becomes the replacing string:
12104 *
12105 * s = '@'
12106 * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12107 *
12108 * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12109 * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12110 *
12111 *
12112 * == What's Here
12113 *
12114 * First, what's elsewhere. \Class \String:
12115 *
12116 * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12117 * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12118 *
12119 * Here, class \String provides methods that are useful for:
12120 *
12121 * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12122 * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12123 * - {Querying}[#class-String-label-Methods+for+Querying]
12124 * - {Comparing}[#class-String-label-Methods+for+Comparing]
12125 * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12126 * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12127 * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12128 * - {Iterating}[#class-String-label-Methods+for+Iterating]
12129 *
12130 * === Methods for Creating a \String
12131 *
12132 * - ::new:: Returns a new string.
12133 * - ::try_convert:: Returns a new string created from a given object.
12134 *
12135 * === Methods for a Frozen/Unfrozen String
12136 *
12137 * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12138 * +self+, if not frozen; +self.dup+ otherwise.
12139 * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12140 * +self+, if already frozen; +self.freeze+ otherwise.
12141 * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12142 *
12143 * === Methods for Querying
12144 *
12145 * _Counts_
12146 *
12147 * - #length, #size:: Returns the count of characters (not bytes).
12148 * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12149 * - #bytesize:: Returns the count of bytes.
12150 * - #count:: Returns the count of substrings matching given strings.
12151 *
12152 * _Substrings_
12153 *
12154 * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12155 * returns +nil+ if no match is found.
12156 * - #index:: Returns the index of the _first_ occurrence of a given substring;
12157 * returns +nil+ if none found.
12158 * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12159 * returns +nil+ if none found.
12160 * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12161 * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12162 * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12163 * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12164 * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12165 *
12166 * _Encodings_
12167 *
12168 * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12169 * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12170 * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12171 * for its encoding.
12172 * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12173 *
12174 * _Other_
12175 *
12176 * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12177 * - #hash:: Returns the integer hash code.
12178 *
12179 * === Methods for Comparing
12180 *
12181 * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12182 * - #eql?:: Returns +true+ if the content is the same as the given other string.
12183 * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given other string is smaller than, equal to, or larger than +self+.
12184 * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
12185 * other string is smaller than, equal to, or larger than +self+.
12186 * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12187 * +false+ otherwise.
12188 *
12189 * === Methods for Modifying a \String
12190 *
12191 * Each of these methods modifies +self+.
12192 *
12193 * _Insertion_
12194 *
12195 * - #insert:: Returns +self+ with a given string inserted at a given offset.
12196 * - #<<:: Returns +self+ concatenated with a given string or integer.
12197 *
12198 * _Substitution_
12199 *
12200 * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12201 * returns +self+ if any changes, +nil+ otherwise.
12202 * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12203 * returns +self+ if any changes, +nil+ otherwise.
12204 * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12205 * - #replace:: Returns +self+ with its entire content replaced by a given string.
12206 * - #reverse!:: Returns +self+ with its characters in reverse order.
12207 * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12208 * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12209 * returns +self+ if any changes, +nil+ otherwise.
12210 * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12211 * removing duplicates from the substrings that were modified;
12212 * returns +self+ if any changes, +nil+ otherwise.
12213 *
12214 * _Casing_
12215 *
12216 * - #capitalize!:: Upcases the initial character and downcases all others;
12217 * returns +self+ if any changes, +nil+ otherwise.
12218 * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12219 * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12220 * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12221 * returns +self+ if any changes, +nil+ otherwise.
12222 *
12223 * _Encoding_
12224 *
12225 * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12226 * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12227 * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12228 * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12229 *
12230 * _Deletion_
12231 *
12232 * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12233 * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12234 * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12235 * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12236 * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12237 * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12238 * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12239 * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12240 * - #chop!:: Removes trailing whitespace if found, otherwise removes the last character;
12241 * returns +self+ if any changes, +nil+ otherwise.
12242 *
12243 * === Methods for Converting to New \String
12244 *
12245 * Each of these methods returns a new \String based on +self+,
12246 * often just a modified copy of +self+.
12247 *
12248 * _Extension_
12249 *
12250 * - #*:: Returns the concatenation of multiple copies of +self+.
12251 * - #+:: Returns the concatenation of +self+ and a given other string.
12252 * - #center:: Returns a copy of +self+ centered between pad substring.
12253 * - #concat:: Returns the concatenation of +self+ with given other strings.
12254 * - #prepend:: Returns the concatenation of a given other string with +self+.
12255 * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12256 * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12257 *
12258 * _Encoding_
12259 *
12260 * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12261 * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12262 * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12263 * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12264 *
12265 * _Substitution_
12266 *
12267 * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12268 * and all special characters escaped.
12269 * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12270 * and all escaped characters unescaped.
12271 * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12272 * replaced with a given replacement string.
12273 * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12274 * replaced with a given replacement string.
12275 * - #succ, #next:: Returns the string that is the successor to +self+.
12276 * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12277 * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12278 * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12279 * removing duplicates from the substrings that were modified.
12280 * - #%:: Returns the string resulting from formatting a given object into +self+.
12281 *
12282 * _Casing_
12283 *
12284 * - #capitalize:: Returns a copy of +self+ with the first character upcased
12285 * and all other characters downcased.
12286 * - #downcase:: Returns a copy of +self+ with all characters downcased.
12287 * - #upcase:: Returns a copy of +self+ with all characters upcased.
12288 * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12289 * and all downcase characters upcased.
12290 *
12291 * _Deletion_
12292 *
12293 * - #delete:: Returns a copy of +self+ with characters removed.
12294 * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12295 * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12296 * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12297 * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12298 * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12299 * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12300 * - #chop:: Returns a copy of +self+ with trailing whitespace or the last character removed.
12301 * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12302 * - #[], #slice:: Returns a substring determined by a given index, start/length, or range, or string.
12303 * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12304 * - #chr:: Returns the first character.
12305 *
12306 * _Duplication_
12307 *
12308 * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12309 * otherwise, returns +self+.
12310 *
12311 * === Methods for Converting to Non-\String
12312 *
12313 * Each of these methods converts the contents of +self+ to a non-\String.
12314 *
12315 * <em>Characters, Bytes, and Clusters</em>
12316 *
12317 * - #bytes:: Returns an array of the bytes in +self+.
12318 * - #chars:: Returns an array of the characters in +self+.
12319 * - #codepoints:: Returns an array of the integer ordinals in +self+.
12320 * - #getbyte:: Returns an integer byte as determined by a given index.
12321 * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12322 *
12323 * _Splitting_
12324 *
12325 * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12326 * - #partition:: Returns a 3-element array determined by the first substring that matches
12327 * a given substring or regexp.
12328 * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12329 * a given substring or regexp.
12330 * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12331 * or, if a block given, passes those substrings to the block.
12332 *
12333 * _Matching_
12334 *
12335 * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12336 * if a block given, passes each matching substring to the block.
12337 * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12338 * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12339 *
12340 * _Numerics_
12341 *
12342 * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12343 * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12344 * - #ord:: Returns the integer ordinal of the first character in +self+.
12345 * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12346 * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12347 *
12348 * <em>Strings and Symbols</em>
12349 *
12350 * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12351 * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12352 *
12353 * === Methods for Iterating
12354 *
12355 * - #each_byte:: Calls the given block with each successive byte in +self+.
12356 * - #each_char:: Calls the given block with each successive character in +self+.
12357 * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12358 * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12359 * - #each_line:: Calls the given block with each successive line in +self+,
12360 * as determined by a given record separator.
12361 * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12362 */
12363
12364void
12365Init_String(void)
12366{
    /*
     * Defines the String class and, further down, the Symbol class (both
     * directly under rb_cObject) and registers their methods.  Along the way
     * it interns the :ascii, :turkic, :lithuanian and :fold symbols, defines
     * the UnicodeNormalize module, and sets up the hooked variables $; and $-F.
     *
     * NOTE(review): the listing numbers fused to the start of each line skip
     * in several places (e.g. 12369 -> 12371, 12375 -> 12378, 12530 -> 12534),
     * and String methods named in the class overview above -- #length, #size,
     * #==, #+, #*, #<<, #succ, #replace, #freeze, #inspect, #dump -- have no
     * registration call in this copy.  Statements were probably lost when the
     * listing was extracted; compare against the upstream file before relying
     * on this list being complete.
     *
     * NOTE(review): an arity of -1 presumably selects the C API's variadic
     * (argc, argv, self) handler form -- confirm against the extension API docs.
     */
12367 rb_cString = rb_define_class("String", rb_cObject);
    /*
     * The VM's fstring table must already exist; every entry is visited with
     * fstring_set_class_i and handed rb_cString.
     * NOTE(review): presumably this gives strings interned before String
     * existed their class -- confirm in fstring_set_class_i.
     */
12368 assert(rb_vm_fstring_table());
12369 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
    /* Allocation, conversion and construction; initialize_copy is served by rb_str_replace. */
12371 rb_define_alloc_func(rb_cString, empty_str_alloc);
12372 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12373 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12374 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    /* Comparison and hashing. */
12375 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12378 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12379 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12380 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12381 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    /* Formatting, element reference/assignment and insertion. */
12384 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12385 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12386 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12387 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    /* Size queries and pattern matching. */
12390 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12391 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12392 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12393 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12394 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    /* succ! and next! share one handler. */
12396 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12398 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12399 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12400 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12401 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    /* Clearing, first character, byte-level access and scrubbing. */
12403 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12404 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12405 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12406 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12407 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12408 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12409 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
    /* Unary plus and minus ("+@" / "-@"). */
12411 rb_define_method(rb_cString, "+@", str_uplus, 0);
12412 rb_define_method(rb_cString, "-@", str_uminus, 0);
12413
    /* Conversions; to_s and to_str share rb_str_to_s. */
12414 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12415 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12416 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12417 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12420 rb_define_method(rb_cString, "undump", str_undump, 0);
12421
    /*
     * Symbols interned once and kept in file-level variables.
     * NOTE(review): presumably these are the option symbols accepted by the
     * case-mapping methods registered just below -- confirm at their use sites.
     */
12422 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12423 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12424 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12425 sym_fold = ID2SYM(rb_intern_const("fold"));
12426
    /* Case mapping: copying forms first, then the in-place (bang) forms. */
12427 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12428 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12429 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12430 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12431
12432 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12433 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12434 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12435 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12436
    /* Numeric parsing, splitting into arrays, reversal, concatenation and symbol conversion. */
12437 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12438 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12439 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12440 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12441 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12442 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12443 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12444 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12445 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12446 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12447 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12449 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12450 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12451 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12452 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12453 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12454
    /* Substring predicates. */
12455 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12456 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12457 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12458
12459 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12460
    /* Padding / justification. */
12461 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12462 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12463 rb_define_method(rb_cString, "center", rb_str_center, -1);
12464
    /* Substitution and trimming: copying forms here, in-place (bang) forms in the next group. */
12465 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12466 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12467 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12468 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12469 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12470 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12471 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12472 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12473 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12474
12475 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12476 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12477 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12478 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12479 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12480 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12481 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12482 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12483 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12484
    /* Character-set operations: copying forms (plus count), then the bang forms. */
12485 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12486 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12487 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12488 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12489 rb_define_method(rb_cString, "count", rb_str_count, -1);
12490
12491 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12492 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12493 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12494 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12495
    /* Iterators. */
12496 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12497 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12498 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12499 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12500 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12501
12502 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12503
    /* "slice" reuses the handler registered for "[]" (rb_str_aref_m). */
12504 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12505 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12506
12507 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12508 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12509
    /* Encoding queries and changes. */
12510 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12511 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12512 rb_define_method(rb_cString, "b", rb_str_b, 0);
12513 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12514 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12515
12516 /* define UnicodeNormalize module here so that we don't have to look it up */
12517 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12518 id_normalize = rb_intern_const("normalize");
12519 id_normalized_p = rb_intern_const("normalized?");
12520
12521 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12522 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12523 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12524
    /*
     * rb_fs starts out as nil and backs both $; and $-F; both variables are
     * hooked with rb_fs_setter as the setter, and the address of rb_fs is
     * registered with the GC.
     * NOTE(review): the 0 passed as getter presumably selects the default
     * getter -- confirm against the rb_define_hooked_variable documentation.
     */
12525 rb_fs = Qnil;
12526 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12527 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12528 rb_gc_register_address(&rb_fs);
12529
    /* Symbol is defined here as well, directly under rb_cObject. */
12530 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12534 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12535
    /* Handlers shared by name pairs: ==/===, intern/to_sym, succ/next. */
12536 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12537 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12538 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12540 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12542 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12543 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12544 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12545 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12546 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12547
12548 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12549 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12550 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12551 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12552
    /* []/slice share sym_aref; length/size share sym_length. */
12553 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12554 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12555 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12556 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12557 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12558 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12559 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12560
    /* Symbol case mapping (copying forms only). */
12561 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12562 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12563 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12564 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12565
12566 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12567 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12568
12569 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12570}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
#define RUBY_ATOMIC_CAS(var, oldval, newval)
Atomic compare-and-swap.
Definition atomic.h:138
std::atomic< unsigned > rb_atomic_t
Type that is eligible for atomic operations.
Definition atomic.h:69
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:166
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1182
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:912
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1043
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:837
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:948
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:1938
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2406
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:854
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2195
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_usascii_str_new_cstr.
Definition string.h:1746
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1747
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:30
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:31
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
Definition error.c:428
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3025
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:675
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3137
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:802
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1103
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1099
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition error.c:3076
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1106
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1097
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1100
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1101
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:553
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:1909
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1173
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3325
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:188
VALUE rb_cSymbol
Symbol class.
Definition string.c:81
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:120
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1161
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:80
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:2998
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition rgengc.h:220
Encoding related APIs.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition encoding.h:433
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:697
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition encoding.h:676
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:718
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition encoding.h:782
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition encoding.h:657
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:463
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:607
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:448
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:635
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:740
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1182
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:776
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1034
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2735
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1067
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:11955
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:247
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2071
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3271
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:980
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition string.c:940
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1287
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1188
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:790
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:11966
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:668
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:406
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1449
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2614
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2929
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1705
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1102
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1189
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:294
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:553
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:204
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1580
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:991
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1586
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1197
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:3659
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3260
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1377
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1793
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:11936
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1546
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1350
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2232
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1646
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3317
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1263
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11518
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2304
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1239
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1540
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2763
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4564
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3526
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:2821
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10802
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1720
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1545
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1593
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1745
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1016
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1579
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:828
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1356
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1808
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2459
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3516
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3161
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2160
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1814
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1702
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1630
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:5841
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:2844
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1177
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:11949
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1269
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1667
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3302
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2810
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3628
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3039
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6456
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2511
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:11942
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3582
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3418
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition string.c:1040
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:3557
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1756
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3278
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2963
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5145
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:10860
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1688
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1487
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:662
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2659
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2931
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1719
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3022
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3056
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1028
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1596
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2467
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6567
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1251
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1506
#define rb_tainted_str_new_cstr(str)
Definition string.h:1613
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2180
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1561
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5071
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8676
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1022
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:837
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1657
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2765
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1117
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition symbol.c:782
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:924
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11904
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition string.c:11894
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition symbol.c:788
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1697
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3053
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:3856
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1357
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Define a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:324
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:69
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:139
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1281
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2636
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:423
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:553
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:527
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:573
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2520
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:483
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1275
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2531
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1584
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:497
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1270
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:231
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:234
long capa
Capacity of *ptr.
Definition rstring.h:268
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:250
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition rstring.h:298
union RString::@46::@47::@49 aux
Auxiliary info.
struct RString::@46::@48 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition rstring.h:258
struct RString::@46::@47 heap
Strings that use separated memory region for contents use this pattern.
union RString::@46 as
String's specific fields.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:190
Definition st.h:79
Definition string.c:7522
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:440
void rb_nativethread_lock_initialize(rb_nativethread_lock_t *lock)
Fills the passed lock with an initial value.
Definition thread.c:428
void rb_nativethread_lock_destroy(rb_nativethread_lock_t *lock)
Destroys the passed mutex.
Definition thread.c:434
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:432
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:375