Ruby 1.9.3p537 (2014-02-19 revision 0)
ext/strscan/strscan.c
Go to the documentation of this file.
00001 /*
00002     $Id$
00003 
00004     Copyright (c) 1999-2006 Minero Aoki
00005 
00006     This program is free software.
00007     You can distribute/modify this program under the terms of
00008     the Ruby License. For details, see the file COPYING.
00009 */
00010 
00011 #include "ruby/ruby.h"
00012 #include "ruby/re.h"
00013 #include "ruby/encoding.h"
00014 
00015 #define STRSCAN_VERSION "0.7.0"
00016 
00017 #ifdef PRIsVALUE
00018 # define RB_OBJ_CLASSNAME(obj) rb_obj_class(obj)
00019 # define RB_OBJ_STRING(obj) (obj)
00020 #else
00021 # define PRIsVALUE "s"
00022 # define RB_OBJ_CLASSNAME(obj) rb_obj_classname(obj)
00023 # define RB_OBJ_STRING(obj) StringValueCStr(obj)
00024 #endif
00025 
00026 /* =======================================================================
00027                          Data Type Definitions
00028    ======================================================================= */
00029 
00030 static VALUE StringScanner;
00031 static VALUE ScanError;
00032 
00033 struct strscanner
00034 {
00035     /* multi-purpose flags */
00036     unsigned long flags;
00037 #define FLAG_MATCHED (1 << 0)
00038 
00039     /* the string to scan */
00040     VALUE str;
00041 
00042     /* scan pointers */
00043     long prev;   /* legal only when MATCHED_P(s) */
00044     long curr;   /* always legal */
00045 
00046     /* the regexp register; legal only when MATCHED_P(s) */
00047     struct re_registers regs;
00048 };
00049 
/* Match-status flag helpers; each expands to a single parenthesized expression. */
#define MATCHED_P(s)           ((s)->flags & FLAG_MATCHED)
#define MATCHED(s)             ((s)->flags |= FLAG_MATCHED)
#define CLEAR_MATCH_STATUS(s)  ((s)->flags &= ~FLAG_MATCHED)

/* Accessors for the scanned string: start, byte length, end, cursor, bytes left. */
#define S_PBEG(s)     (RSTRING_PTR((s)->str))
#define S_LEN(s)      (RSTRING_LEN((s)->str))
#define S_PEND(s)     (S_PBEG(s) + S_LEN(s))
#define CURPTR(s)     (S_PBEG(s) + (s)->curr)
#define S_RESTLEN(s)  (S_LEN(s) - (s)->curr)

/*
 * True when the scan pointer is at or past the end of the string.
 * Reads the string through the macro argument, so it works for any
 * scanner expression, not only for a local that happens to be named `p`.
 */
#define EOS_P(s) ((s)->curr >= RSTRING_LEN((s)->str))

/*
 * Fetches the struct strscanner wrapped by obj into var and raises
 * ArgumentError if no string has been assigned to the scanner yet.
 */
#define GET_SCANNER(obj,var) do {\
    Data_Get_Struct((obj), struct strscanner, (var));\
    if (NIL_P((var)->str)) rb_raise(rb_eArgError, "uninitialized StringScanner object");\
} while (0)
00066 
00067 /* =======================================================================
00068                             Function Prototypes
00069    ======================================================================= */
00070 
00071 static VALUE infect _((VALUE str, struct strscanner *p));
00072 static VALUE extract_range _((struct strscanner *p, long beg_i, long end_i));
00073 static VALUE extract_beg_len _((struct strscanner *p, long beg_i, long len));
00074 
00075 void check_strscan _((VALUE obj));
00076 static void strscan_mark _((struct strscanner *p));
00077 static void strscan_free _((struct strscanner *p));
00078 static VALUE strscan_s_allocate _((VALUE klass));
00079 static VALUE strscan_initialize _((int argc, VALUE *argv, VALUE self));
00080 static VALUE strscan_init_copy _((VALUE vself, VALUE vorig));
00081 
00082 static VALUE strscan_s_mustc _((VALUE self));
00083 static VALUE strscan_terminate _((VALUE self));
00084 static VALUE strscan_clear _((VALUE self));
00085 static VALUE strscan_get_string _((VALUE self));
00086 static VALUE strscan_set_string _((VALUE self, VALUE str));
00087 static VALUE strscan_concat _((VALUE self, VALUE str));
00088 static VALUE strscan_get_pos _((VALUE self));
00089 static VALUE strscan_set_pos _((VALUE self, VALUE pos));
00090 static VALUE strscan_do_scan _((VALUE self, VALUE regex,
00091                                 int succptr, int getstr, int headonly));
00092 static VALUE strscan_scan _((VALUE self, VALUE re));
00093 static VALUE strscan_match_p _((VALUE self, VALUE re));
00094 static VALUE strscan_skip _((VALUE self, VALUE re));
00095 static VALUE strscan_check _((VALUE self, VALUE re));
00096 static VALUE strscan_scan_full _((VALUE self, VALUE re,
00097                                   VALUE succp, VALUE getp));
00098 static VALUE strscan_scan_until _((VALUE self, VALUE re));
00099 static VALUE strscan_skip_until _((VALUE self, VALUE re));
00100 static VALUE strscan_check_until _((VALUE self, VALUE re));
00101 static VALUE strscan_search_full _((VALUE self, VALUE re,
00102                                     VALUE succp, VALUE getp));
00103 static void adjust_registers_to_matched _((struct strscanner *p));
00104 static VALUE strscan_getch _((VALUE self));
00105 static VALUE strscan_get_byte _((VALUE self));
00106 static VALUE strscan_getbyte _((VALUE self));
00107 static VALUE strscan_peek _((VALUE self, VALUE len));
00108 static VALUE strscan_peep _((VALUE self, VALUE len));
00109 static VALUE strscan_unscan _((VALUE self));
00110 static VALUE strscan_bol_p _((VALUE self));
00111 static VALUE strscan_eos_p _((VALUE self));
00112 static VALUE strscan_empty_p _((VALUE self));
00113 static VALUE strscan_rest_p _((VALUE self));
00114 static VALUE strscan_matched_p _((VALUE self));
00115 static VALUE strscan_matched _((VALUE self));
00116 static VALUE strscan_matched_size _((VALUE self));
00117 static VALUE strscan_aref _((VALUE self, VALUE idx));
00118 static VALUE strscan_pre_match _((VALUE self));
00119 static VALUE strscan_post_match _((VALUE self));
00120 static VALUE strscan_rest _((VALUE self));
00121 static VALUE strscan_rest_size _((VALUE self));
00122 
00123 static VALUE strscan_inspect _((VALUE self));
00124 static VALUE inspect1 _((struct strscanner *p));
00125 static VALUE inspect2 _((struct strscanner *p));
00126 
00127 /* =======================================================================
00128                                    Utils
00129    ======================================================================= */
00130 
00131 static VALUE
00132 infect(VALUE str, struct strscanner *p)
00133 {
00134     OBJ_INFECT(str, p->str);
00135     return str;
00136 }
00137 
00138 static VALUE
00139 str_new(struct strscanner *p, const char *ptr, long len)
00140 {
00141     VALUE str = rb_str_new(ptr, len);
00142     rb_enc_copy(str, p->str);
00143     return str;
00144 }
00145 
00146 static VALUE
00147 extract_range(struct strscanner *p, long beg_i, long end_i)
00148 {
00149     if (beg_i > S_LEN(p)) return Qnil;
00150     if (end_i > S_LEN(p))
00151         end_i = S_LEN(p);
00152     return infect(str_new(p, S_PBEG(p) + beg_i, end_i - beg_i), p);
00153 }
00154 
00155 static VALUE
00156 extract_beg_len(struct strscanner *p, long beg_i, long len)
00157 {
00158     if (beg_i > S_LEN(p)) return Qnil;
00159     if (beg_i + len > S_LEN(p))
00160         len = S_LEN(p) - beg_i;
00161     return infect(str_new(p, S_PBEG(p) + beg_i, len), p);
00162 }
00163 
00164 /* =======================================================================
00165                                Constructor
00166    ======================================================================= */
00167 
00168 static void
00169 strscan_mark(struct strscanner *p)
00170 {
00171     rb_gc_mark(p->str);
00172 }
00173 
00174 static void
00175 strscan_free(struct strscanner *p)
00176 {
00177     onig_region_free(&(p->regs), 0);
00178     ruby_xfree(p);
00179 }
00180 
00181 static VALUE
00182 strscan_s_allocate(VALUE klass)
00183 {
00184     struct strscanner *p;
00185 
00186     p = ALLOC(struct strscanner);
00187     MEMZERO(p, struct strscanner, 1);
00188     CLEAR_MATCH_STATUS(p);
00189     onig_region_init(&(p->regs));
00190     p->str = Qnil;
00191     return Data_Wrap_Struct(klass, strscan_mark, strscan_free, p);
00192 }
00193 
00194 /*
00195  * call-seq: StringScanner.new(string, dup = false)
00196  *
00197  * Creates a new StringScanner object to scan over the given +string+.
00198  * +dup+ argument is obsolete and not used now.
00199  */
00200 static VALUE
00201 strscan_initialize(int argc, VALUE *argv, VALUE self)
00202 {
00203     struct strscanner *p;
00204     VALUE str, need_dup;
00205 
00206     Data_Get_Struct(self, struct strscanner, p);
00207     rb_scan_args(argc, argv, "11", &str, &need_dup);
00208     StringValue(str);
00209     p->str = str;
00210 
00211     return self;
00212 }
00213 
00214 void
00215 check_strscan(VALUE obj)
00216 {
00217     if (TYPE(obj) != T_DATA || RDATA(obj)->dmark != (RUBY_DATA_FUNC)strscan_mark) {
00218         rb_raise(rb_eTypeError,
00219                  "wrong argument type %s (expected StringScanner)",
00220                  rb_obj_classname(obj));
00221     }
00222 }
00223 
00224 /*
00225  * call-seq:
00226  *   dup
00227  *   clone
00228  *
00229  * Duplicates a StringScanner object.
00230  */
00231 static VALUE
00232 strscan_init_copy(VALUE vself, VALUE vorig)
00233 {
00234     struct strscanner *self, *orig;
00235 
00236     Data_Get_Struct(vself, struct strscanner, self);
00237     check_strscan(vorig);
00238     Data_Get_Struct(vorig, struct strscanner, orig);
00239     if (self != orig) {
00240         self->flags = orig->flags;
00241         self->str = orig->str;
00242         self->prev = orig->prev;
00243         self->curr = orig->curr;
00244         onig_region_copy(&self->regs, &orig->regs);
00245     }
00246 
00247     return vself;
00248 }
00249 
00250 /* =======================================================================
00251                           Instance Methods
00252    ======================================================================= */
00253 
00254 /*
00255  * call-seq: StringScanner.must_C_version
00256  *
00257  * This method is defined for backward compatibility.
00258  */
00259 static VALUE
00260 strscan_s_mustc(VALUE self)
00261 {
00262     return self;
00263 }
00264 
00265 /*
00266  * Reset the scan pointer (index 0) and clear matching data.
00267  */
00268 static VALUE
00269 strscan_reset(VALUE self)
00270 {
00271     struct strscanner *p;
00272 
00273     GET_SCANNER(self, p);
00274     p->curr = 0;
00275     CLEAR_MATCH_STATUS(p);
00276     return self;
00277 }
00278 
00279 /*
00280  * call-seq:
00281  *   terminate
00282  *   clear
00283  *
00284  * Set the scan pointer to the end of the string and clear matching data.
00285  */
00286 static VALUE
00287 strscan_terminate(VALUE self)
00288 {
00289     struct strscanner *p;
00290 
00291     GET_SCANNER(self, p);
00292     p->curr = S_LEN(p);
00293     CLEAR_MATCH_STATUS(p);
00294     return self;
00295 }
00296 
00297 /*
00298  * Equivalent to #terminate.
00299  * This method is obsolete; use #terminate instead.
00300  */
00301 static VALUE
00302 strscan_clear(VALUE self)
00303 {
00304     rb_warning("StringScanner#clear is obsolete; use #terminate instead");
00305     return strscan_terminate(self);
00306 }
00307 
00308 /*
00309  * Returns the string being scanned.
00310  */
00311 static VALUE
00312 strscan_get_string(VALUE self)
00313 {
00314     struct strscanner *p;
00315 
00316     GET_SCANNER(self, p);
00317     return p->str;
00318 }
00319 
00320 /*
00321  * call-seq: string=(str)
00322  *
00323  * Changes the string being scanned to +str+ and resets the scanner.
00324  * Returns +str+.
00325  */
00326 static VALUE
00327 strscan_set_string(VALUE self, VALUE str)
00328 {
00329     struct strscanner *p;
00330 
00331     Data_Get_Struct(self, struct strscanner, p);
00332     StringValue(str);
00333     p->str = str;
00334     p->curr = 0;
00335     CLEAR_MATCH_STATUS(p);
00336     return str;
00337 }
00338 
00339 /*
00340  * call-seq:
00341  *   concat(str)
00342  *   <<(str)
00343  *
00344  * Appends +str+ to the string being scanned.
00345  * This method does not affect scan pointer.
00346  *
00347  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00348  *   s.scan(/Fri /)
00349  *   s << " +1000 GMT"
00350  *   s.string            # -> "Fri Dec 12 1975 14:39 +1000 GMT"
00351  *   s.scan(/Dec/)       # -> "Dec"
00352  */
00353 static VALUE
00354 strscan_concat(VALUE self, VALUE str)
00355 {
00356     struct strscanner *p;
00357 
00358     GET_SCANNER(self, p);
00359     StringValue(str);
00360     rb_str_append(p->str, str);
00361     return self;
00362 }
00363 
00364 /*
00365  * Returns the byte position of the scan pointer.  In the 'reset' position, this
00366  * value is zero.  In the 'terminated' position (i.e. the string is exhausted),
00367  * this value is the bytesize of the string.
00368  *
00369  * In short, it's a 0-based index into the string.
00370  *
00371  *   s = StringScanner.new('test string')
00372  *   s.pos               # -> 0
00373  *   s.scan_until /str/  # -> "test str"
00374  *   s.pos               # -> 8
00375  *   s.terminate         # -> #<StringScanner fin>
00376  *   s.pos               # -> 11
00377  */
00378 static VALUE
00379 strscan_get_pos(VALUE self)
00380 {
00381     struct strscanner *p;
00382 
00383     GET_SCANNER(self, p);
00384     return INT2FIX(p->curr);
00385 }
00386 
00387 /*
00388  * call-seq: pos=(n)
00389  *
00390  * Set the byte position of the scan pointer.
00391  *
00392  *   s = StringScanner.new('test string')
00393  *   s.pos = 7            # -> 7
00394  *   s.rest               # -> "ring"
00395  */
00396 static VALUE
00397 strscan_set_pos(VALUE self, VALUE v)
00398 {
00399     struct strscanner *p;
00400     long i;
00401 
00402     GET_SCANNER(self, p);
00403     i = NUM2INT(v);
00404     if (i < 0) i += S_LEN(p);
00405     if (i < 0) rb_raise(rb_eRangeError, "index out of range");
00406     if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range");
00407     p->curr = i;
00408     return INT2NUM(i);
00409 }
00410 
00411 static VALUE
00412 strscan_do_scan(VALUE self, VALUE regex, int succptr, int getstr, int headonly)
00413 {
00414     regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
00415     struct strscanner *p;
00416     regex_t *re;
00417     long ret;
00418     int tmpreg;
00419 
00420     Check_Type(regex, T_REGEXP);
00421     GET_SCANNER(self, p);
00422 
00423     CLEAR_MATCH_STATUS(p);
00424     if (S_RESTLEN(p) < 0) {
00425         return Qnil;
00426     }
00427     re = rb_reg_prepare_re(regex, p->str);
00428     tmpreg = re != RREGEXP(regex)->ptr;
00429     if (!tmpreg) RREGEXP(regex)->usecnt++;
00430 
00431     if (headonly) {
00432         ret = onig_match(re, (UChar* )CURPTR(p),
00433                          (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00434                          (UChar* )CURPTR(p), &(p->regs), ONIG_OPTION_NONE);
00435     }
00436     else {
00437         ret = onig_search(re,
00438                           (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00439                           (UChar* )CURPTR(p), (UChar* )(CURPTR(p) + S_RESTLEN(p)),
00440                           &(p->regs), ONIG_OPTION_NONE);
00441     }
00442     if (!tmpreg) RREGEXP(regex)->usecnt--;
00443     if (tmpreg) {
00444         if (RREGEXP(regex)->usecnt) {
00445             onig_free(re);
00446         }
00447         else {
00448             onig_free(RREGEXP(regex)->ptr);
00449             RREGEXP(regex)->ptr = re;
00450         }
00451     }
00452 
00453     if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
00454     if (ret < 0) {
00455         /* not matched */
00456         return Qnil;
00457     }
00458 
00459     MATCHED(p);
00460     p->prev = p->curr;
00461     if (succptr) {
00462         p->curr += p->regs.end[0];
00463     }
00464     if (getstr) {
00465         return extract_beg_len(p, p->prev, p->regs.end[0]);
00466     }
00467     else {
00468         return INT2FIX(p->regs.end[0]);
00469     }
00470 }
00471 
00472 /*
00473  * call-seq: scan(pattern) => String
00474  *
00475  * Tries to match with +pattern+ at the current position. If there's a match,
00476  * the scanner advances the "scan pointer" and returns the matched string.
00477  * Otherwise, the scanner returns +nil+.
00478  *
00479  *   s = StringScanner.new('test string')
00480  *   p s.scan(/\w+/)   # -> "test"
00481  *   p s.scan(/\w+/)   # -> nil
00482  *   p s.scan(/\s+/)   # -> " "
00483  *   p s.scan(/\w+/)   # -> "string"
00484  *   p s.scan(/./)     # -> nil
00485  *
00486  */
00487 static VALUE
00488 strscan_scan(VALUE self, VALUE re)
00489 {
00490     return strscan_do_scan(self, re, 1, 1, 1);
00491 }
00492 
00493 /*
00494  * call-seq: match?(pattern)
00495  *
00496  * Tests whether the given +pattern+ is matched from the current scan pointer.
00497  * Returns the length of the match, or +nil+.  The scan pointer is not advanced.
00498  *
00499  *   s = StringScanner.new('test string')
00500  *   p s.match?(/\w+/)   # -> 4
00501  *   p s.match?(/\w+/)   # -> 4
00502  *   p s.match?(/\s+/)   # -> nil
00503  */
00504 static VALUE
00505 strscan_match_p(VALUE self, VALUE re)
00506 {
00507     return strscan_do_scan(self, re, 0, 0, 1);
00508 }
00509 
00510 /*
00511  * call-seq: skip(pattern)
00512  *
00513  * Attempts to skip over the given +pattern+ beginning with the scan pointer.
00514  * If it matches, the scan pointer is advanced to the end of the match, and the
00515  * length of the match is returned.  Otherwise, +nil+ is returned.
00516  *
00517  * It's similar to #scan, but without returning the matched string.
00518  *
00519  *   s = StringScanner.new('test string')
00520  *   p s.skip(/\w+/)   # -> 4
00521  *   p s.skip(/\w+/)   # -> nil
00522  *   p s.skip(/\s+/)   # -> 1
00523  *   p s.skip(/\w+/)   # -> 6
00524  *   p s.skip(/./)     # -> nil
00525  *
00526  */
00527 static VALUE
00528 strscan_skip(VALUE self, VALUE re)
00529 {
00530     return strscan_do_scan(self, re, 1, 0, 1);
00531 }
00532 
00533 /*
00534  * call-seq: check(pattern)
00535  *
00536  * This returns the value that #scan would return, without advancing the scan
00537  * pointer.  The match register is affected, though.
00538  *
00539  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00540  *   s.check /Fri/               # -> "Fri"
00541  *   s.pos                       # -> 0
00542  *   s.matched                   # -> "Fri"
00543  *   s.check /12/                # -> nil
00544  *   s.matched                   # -> nil
00545  *
00546  * Mnemonic: it "checks" to see whether a #scan will return a value.
00547  */
00548 static VALUE
00549 strscan_check(VALUE self, VALUE re)
00550 {
00551     return strscan_do_scan(self, re, 0, 1, 1);
00552 }
00553 
00554 /*
00555  * call-seq: scan_full(pattern, advance_pointer_p, return_string_p)
00556  *
00557  * Tests whether the given +pattern+ is matched from the current scan pointer.
00558  * Advances the scan pointer if +advance_pointer_p+ is true.
00559  * Returns the matched string if +return_string_p+ is true.
00560  * The match register is affected.
00561  *
00562  * "full" means "#scan with full parameters".
00563  */
00564 static VALUE
00565 strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f)
00566 {
00567     return strscan_do_scan(self, re, RTEST(s), RTEST(f), 1);
00568 }
00569 
00570 /*
00571  * call-seq: scan_until(pattern)
00572  *
00573  * Scans the string _until_ the +pattern+ is matched.  Returns the substring up
00574  * to and including the end of the match, advancing the scan pointer to that
00575  * location. If there is no match, +nil+ is returned.
00576  *
00577  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00578  *   s.scan_until(/1/)        # -> "Fri Dec 1"
00579  *   s.pre_match              # -> "Fri Dec "
00580  *   s.scan_until(/XYZ/)      # -> nil
00581  */
00582 static VALUE
00583 strscan_scan_until(VALUE self, VALUE re)
00584 {
00585     return strscan_do_scan(self, re, 1, 1, 0);
00586 }
00587 
00588 /*
00589  * call-seq: exist?(pattern)
00590  *
00591  * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string,
00592  * without advancing the scan pointer.  This predicates whether a #scan_until
00593  * will return a value.
00594  *
00595  *   s = StringScanner.new('test string')
00596  *   s.exist? /s/            # -> 3
00597  *   s.scan /test/           # -> "test"
00598  *   s.exist? /s/            # -> 2
00599  *   s.exist? /e/            # -> nil
00600  */
00601 static VALUE
00602 strscan_exist_p(VALUE self, VALUE re)
00603 {
00604     return strscan_do_scan(self, re, 0, 0, 0);
00605 }
00606 
00607 /*
00608  * call-seq: skip_until(pattern)
00609  *
00610  * Advances the scan pointer until +pattern+ is matched and consumed.  Returns
00611  * the number of bytes advanced, or +nil+ if no match was found.
00612  *
00613  * Look ahead to match +pattern+, and advance the scan pointer to the _end_
00614  * of the match.  Return the number of characters advanced, or +nil+ if the
00615  * match was unsuccessful.
00616  *
00617  * It's similar to #scan_until, but without returning the intervening string.
00618  *
00619  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00620  *   s.skip_until /12/           # -> 10
00621  *   s                           #
00622  */
00623 static VALUE
00624 strscan_skip_until(VALUE self, VALUE re)
00625 {
00626     return strscan_do_scan(self, re, 1, 0, 0);
00627 }
00628 
00629 /*
00630  * call-seq: check_until(pattern)
00631  *
00632  * This returns the value that #scan_until would return, without advancing the
00633  * scan pointer.  The match register is affected, though.
00634  *
00635  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00636  *   s.check_until /12/          # -> "Fri Dec 12"
00637  *   s.pos                       # -> 0
00638  *   s.matched                   # -> 12
00639  *
00640  * Mnemonic: it "checks" to see whether a #scan_until will return a value.
00641  */
00642 static VALUE
00643 strscan_check_until(VALUE self, VALUE re)
00644 {
00645     return strscan_do_scan(self, re, 0, 1, 0);
00646 }
00647 
00648 /*
00649  * call-seq: search_full(pattern, advance_pointer_p, return_string_p)
00650  *
00651  * Scans the string _until_ the +pattern+ is matched.
00652  * Advances the scan pointer if +advance_pointer_p+, otherwise not.
00653  * Returns the matched string if +return_string_p+ is true, otherwise
00654  * returns the number of bytes advanced.
00655  * This method does affect the match register.
00656  */
00657 static VALUE
00658 strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f)
00659 {
00660     return strscan_do_scan(self, re, RTEST(s), RTEST(f), 0);
00661 }
00662 
00663 static void
00664 adjust_registers_to_matched(struct strscanner *p)
00665 {
00666     onig_region_clear(&(p->regs));
00667     onig_region_set(&(p->regs), 0, 0, (int)(p->curr - p->prev));
00668 }
00669 
00670 /*
00671  * Scans one character and returns it.
00672  * This method is multibyte character sensitive.
00673  *
00674  *   s = StringScanner.new("ab")
00675  *   s.getch           # => "a"
00676  *   s.getch           # => "b"
00677  *   s.getch           # => nil
00678  *
00679  *   $KCODE = 'EUC'
00680  *   s = StringScanner.new("\244\242")
00681  *   s.getch           # => "\244\242"   # Japanese hira-kana "A" in EUC-JP
00682  *   s.getch           # => nil
00683  */
00684 static VALUE
00685 strscan_getch(VALUE self)
00686 {
00687     struct strscanner *p;
00688     long len;
00689 
00690     GET_SCANNER(self, p);
00691     CLEAR_MATCH_STATUS(p);
00692     if (EOS_P(p))
00693         return Qnil;
00694 
00695     len = rb_enc_mbclen(CURPTR(p), S_PEND(p), rb_enc_get(p->str));
00696     if (p->curr + len > S_LEN(p)) {
00697         len = S_LEN(p) - p->curr;
00698     }
00699     p->prev = p->curr;
00700     p->curr += len;
00701     MATCHED(p);
00702     adjust_registers_to_matched(p);
00703     return extract_range(p, p->prev + p->regs.beg[0],
00704                             p->prev + p->regs.end[0]);
00705 }
00706 
00707 /*
00708  * Scans one byte and returns it.
00709  * This method is not multibyte character sensitive.
00710  * See also: #getch.
00711  *
00712  *   s = StringScanner.new('ab')
00713  *   s.get_byte         # => "a"
00714  *   s.get_byte         # => "b"
00715  *   s.get_byte         # => nil
00716  *
00717  *   $KCODE = 'EUC'
00718  *   s = StringScanner.new("\244\242")
00719  *   s.get_byte         # => "\244"
00720  *   s.get_byte         # => "\242"
00721  *   s.get_byte         # => nil
00722  */
00723 static VALUE
00724 strscan_get_byte(VALUE self)
00725 {
00726     struct strscanner *p;
00727 
00728     GET_SCANNER(self, p);
00729     CLEAR_MATCH_STATUS(p);
00730     if (EOS_P(p))
00731         return Qnil;
00732 
00733     p->prev = p->curr;
00734     p->curr++;
00735     MATCHED(p);
00736     adjust_registers_to_matched(p);
00737     return extract_range(p, p->prev + p->regs.beg[0],
00738                             p->prev + p->regs.end[0]);
00739 }
00740 
00741 /*
00742  * Equivalent to #get_byte.
00743  * This method is obsolete; use #get_byte instead.
00744  */
00745 static VALUE
00746 strscan_getbyte(VALUE self)
00747 {
00748     rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead");
00749     return strscan_get_byte(self);
00750 }
00751 
00752 /*
00753  * call-seq: peek(len)
00754  *
00755  * Extracts a string corresponding to <tt>string[pos,len]</tt>, without
00756  * advancing the scan pointer.
00757  *
00758  *   s = StringScanner.new('test string')
00759  *   s.peek(7)          # => "test st"
00760  *   s.peek(7)          # => "test st"
00761  *
00762  */
00763 static VALUE
00764 strscan_peek(VALUE self, VALUE vlen)
00765 {
00766     struct strscanner *p;
00767     long len;
00768 
00769     GET_SCANNER(self, p);
00770 
00771     len = NUM2LONG(vlen);
00772     if (EOS_P(p))
00773         return infect(str_new(p, "", 0), p);
00774 
00775     if (p->curr + len > S_LEN(p))
00776         len = S_LEN(p) - p->curr;
00777     return extract_beg_len(p, p->curr, len);
00778 }
00779 
00780 /*
00781  * Equivalent to #peek.
00782  * This method is obsolete; use #peek instead.
00783  */
00784 static VALUE
00785 strscan_peep(VALUE self, VALUE vlen)
00786 {
00787     rb_warning("StringScanner#peep is obsolete; use #peek instead");
00788     return strscan_peek(self, vlen);
00789 }
00790 
00791 /*
00792  * Set the scan pointer to the previous position.  Only one previous position is
00793  * remembered, and it changes with each scanning operation.
00794  *
00795  *   s = StringScanner.new('test string')
00796  *   s.scan(/\w+/)        # => "test"
00797  *   s.unscan
00798  *   s.scan(/../)         # => "te"
00799  *   s.scan(/\d/)         # => nil
00800  *   s.unscan             # ScanError: unscan failed: previous match record not exist
00801  */
00802 static VALUE
00803 strscan_unscan(VALUE self)
00804 {
00805     struct strscanner *p;
00806 
00807     GET_SCANNER(self, p);
00808     if (! MATCHED_P(p))
00809         rb_raise(ScanError, "unscan failed: previous match record not exist");
00810     p->curr = p->prev;
00811     CLEAR_MATCH_STATUS(p);
00812     return self;
00813 }
00814 
00815 /*
00816  * Returns +true+ iff the scan pointer is at the beginning of the line.
00817  *
00818  *   s = StringScanner.new("test\ntest\n")
00819  *   s.bol?           # => true
00820  *   s.scan(/te/)
00821  *   s.bol?           # => false
00822  *   s.scan(/st\n/)
00823  *   s.bol?           # => true
00824  *   s.terminate
00825  *   s.bol?           # => true
00826  */
00827 static VALUE
00828 strscan_bol_p(VALUE self)
00829 {
00830     struct strscanner *p;
00831 
00832     GET_SCANNER(self, p);
00833     if (CURPTR(p) > S_PEND(p)) return Qnil;
00834     if (p->curr == 0) return Qtrue;
00835     return (*(CURPTR(p) - 1) == '\n') ? Qtrue : Qfalse;
00836 }
00837 
00838 /*
00839  * Returns +true+ if the scan pointer is at the end of the string.
00840  *
00841  *   s = StringScanner.new('test string')
00842  *   p s.eos?          # => false
00843  *   s.scan(/test/)
00844  *   p s.eos?          # => false
00845  *   s.terminate
00846  *   p s.eos?          # => true
00847  */
00848 static VALUE
00849 strscan_eos_p(VALUE self)
00850 {
00851     struct strscanner *p;
00852 
00853     GET_SCANNER(self, p);
00854     return EOS_P(p) ? Qtrue : Qfalse;
00855 }
00856 
00857 /*
00858  * Equivalent to #eos?.
00859  * This method is obsolete, use #eos? instead.
00860  */
00861 static VALUE
00862 strscan_empty_p(VALUE self)
00863 {
00864     rb_warning("StringScanner#empty? is obsolete; use #eos? instead");
00865     return strscan_eos_p(self);
00866 }
00867 
00868 /*
00869  * Returns true iff there is more data in the string.  See #eos?.
00870  * This method is obsolete; use #eos? instead.
00871  *
00872  *   s = StringScanner.new('test string')
00873  *   s.eos?              # These two
00874  *   s.rest?             # are opposites.
00875  */
00876 static VALUE
00877 strscan_rest_p(VALUE self)
00878 {
00879     struct strscanner *p;
00880 
00881     GET_SCANNER(self, p);
00882     return EOS_P(p) ? Qfalse : Qtrue;
00883 }
00884 
00885 /*
00886  * Returns +true+ iff the last match was successful.
00887  *
00888  *   s = StringScanner.new('test string')
00889  *   s.match?(/\w+/)     # => 4
00890  *   s.matched?          # => true
00891  *   s.match?(/\d+/)     # => nil
00892  *   s.matched?          # => false
00893  */
00894 static VALUE
00895 strscan_matched_p(VALUE self)
00896 {
00897     struct strscanner *p;
00898 
00899     GET_SCANNER(self, p);
00900     return MATCHED_P(p) ? Qtrue : Qfalse;
00901 }
00902 
00903 /*
00904  * Returns the last matched string.
00905  *
00906  *   s = StringScanner.new('test string')
00907  *   s.match?(/\w+/)     # -> 4
00908  *   s.matched           # -> "test"
00909  */
00910 static VALUE
00911 strscan_matched(VALUE self)
00912 {
00913     struct strscanner *p;
00914 
00915     GET_SCANNER(self, p);
00916     if (! MATCHED_P(p)) return Qnil;
00917     return extract_range(p, p->prev + p->regs.beg[0],
00918                             p->prev + p->regs.end[0]);
00919 }
00920 
00921 /*
00922  * Returns the size of the most recent match (see #matched), or +nil+ if there
00923  * was no recent match.
00924  *
00925  *   s = StringScanner.new('test string')
00926  *   s.check /\w+/           # -> "test"
00927  *   s.matched_size          # -> 4
00928  *   s.check /\d+/           # -> nil
00929  *   s.matched_size          # -> nil
00930  */
00931 static VALUE
00932 strscan_matched_size(VALUE self)
00933 {
00934     struct strscanner *p;
00935 
00936     GET_SCANNER(self, p);
00937     if (! MATCHED_P(p)) return Qnil;
00938     return INT2NUM(p->regs.end[0] - p->regs.beg[0]);
00939 }
00940 
00941 /*
00942  * call-seq: [](n)
00943  *
00944  * Return the n-th subgroup in the most recent match.
00945  *
00946  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
00947  *   s.scan(/(\w+) (\w+) (\d+) /)       # -> "Fri Dec 12 "
00948  *   s[0]                               # -> "Fri Dec 12 "
00949  *   s[1]                               # -> "Fri"
00950  *   s[2]                               # -> "Dec"
00951  *   s[3]                               # -> "12"
00952  *   s.post_match                       # -> "1975 14:39"
00953  *   s.pre_match                        # -> ""
00954  */
00955 static VALUE
00956 strscan_aref(VALUE self, VALUE idx)
00957 {
00958     struct strscanner *p;
00959     long i;
00960 
00961     GET_SCANNER(self, p);
00962     if (! MATCHED_P(p))        return Qnil;
00963 
00964     i = NUM2LONG(idx);
00965     if (i < 0)
00966         i += p->regs.num_regs;
00967     if (i < 0)                 return Qnil;
00968     if (i >= p->regs.num_regs) return Qnil;
00969     if (p->regs.beg[i] == -1)  return Qnil;
00970 
00971     return extract_range(p, p->prev + p->regs.beg[i],
00972                             p->prev + p->regs.end[i]);
00973 }
00974 
00975 /*
00976  * Return the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan.
00977  *
00978  *   s = StringScanner.new('test string')
00979  *   s.scan(/\w+/)           # -> "test"
00980  *   s.scan(/\s+/)           # -> " "
00981  *   s.pre_match             # -> "test"
00982  *   s.post_match            # -> "string"
00983  */
00984 static VALUE
00985 strscan_pre_match(VALUE self)
00986 {
00987     struct strscanner *p;
00988 
00989     GET_SCANNER(self, p);
00990     if (! MATCHED_P(p)) return Qnil;
00991     return extract_range(p, 0, p->prev + p->regs.beg[0]);
00992 }
00993 
00994 /*
00995  * Return the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan.
00996  *
00997  *   s = StringScanner.new('test string')
00998  *   s.scan(/\w+/)           # -> "test"
00999  *   s.scan(/\s+/)           # -> " "
01000  *   s.pre_match             # -> "test"
01001  *   s.post_match            # -> "string"
01002  */
01003 static VALUE
01004 strscan_post_match(VALUE self)
01005 {
01006     struct strscanner *p;
01007 
01008     GET_SCANNER(self, p);
01009     if (! MATCHED_P(p)) return Qnil;
01010     return extract_range(p, p->prev + p->regs.end[0], S_LEN(p));
01011 }
01012 
01013 /*
01014  * Returns the "rest" of the string (i.e. everything after the scan pointer).
01015  * If there is no more data (eos? = true), it returns <tt>""</tt>.
01016  */
01017 static VALUE
01018 strscan_rest(VALUE self)
01019 {
01020     struct strscanner *p;
01021 
01022     GET_SCANNER(self, p);
01023     if (EOS_P(p)) {
01024         return infect(str_new(p, "", 0), p);
01025     }
01026     return extract_range(p, p->curr, S_LEN(p));
01027 }
01028 
01029 /*
01030  * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>.
01031  */
01032 static VALUE
01033 strscan_rest_size(VALUE self)
01034 {
01035     struct strscanner *p;
01036     long i;
01037 
01038     GET_SCANNER(self, p);
01039     if (EOS_P(p)) {
01040         return INT2FIX(0);
01041     }
01042     i = S_LEN(p) - p->curr;
01043     return INT2FIX(i);
01044 }
01045 
01046 /*
01047  * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>.
01048  * This method is obsolete; use #rest_size instead.
01049  */
01050 static VALUE
01051 strscan_restsize(VALUE self)
01052 {
01053     rb_warning("StringScanner#restsize is obsolete; use #rest_size instead");
01054     return strscan_rest_size(self);
01055 }
01056 
01057 #define INSPECT_LENGTH 5 /* max bytes of text quoted on each side of the scan pointer by inspect1()/inspect2() */
01058 #define BUFSIZE 256 /* size of the stack buffer inspect1() assembles its excerpt in */
01059 
01060 /*
01061  * Returns a string that represents the StringScanner object, showing:
01062  * - the current position
01063  * - the size of the string
01064  * - the characters surrounding the scan pointer
01065  *
01066  *   s = StringScanner.new("Fri Dec 12 1975 14:39")
01067  *   s.inspect            # -> '#<StringScanner 0/21 @ "Fri D...">'
01068  *   s.scan_until /12/    # -> "Fri Dec 12"
01069  *   s.inspect            # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">'
01070  */
01071 static VALUE
01072 strscan_inspect(VALUE self)
01073 {
01074     struct strscanner *p;
01075     VALUE a, b;
01076 
01077     Data_Get_Struct(self, struct strscanner, p);
01078     if (NIL_P(p->str)) {
01079         a = rb_sprintf("#<%"PRIsVALUE" (uninitialized)>", RB_OBJ_CLASSNAME(self));
01080         return infect(a, p);
01081     }
01082     if (EOS_P(p)) {
01083         a = rb_sprintf("#<%"PRIsVALUE" fin>", RB_OBJ_CLASSNAME(self));
01084         return infect(a, p);
01085     }
01086     if (p->curr == 0) {
01087         b = inspect2(p);
01088         a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld @ %"PRIsVALUE">",
01089                        RB_OBJ_CLASSNAME(self),
01090                        p->curr, S_LEN(p),
01091                        RB_OBJ_STRING(b));
01092         return infect(a, p);
01093     }
01094     a = inspect1(p);
01095     b = inspect2(p);
01096     a = rb_sprintf("#<%"PRIsVALUE" %ld/%ld %"PRIsVALUE" @ %"PRIsVALUE">",
01097                    RB_OBJ_CLASSNAME(self),
01098                    p->curr, S_LEN(p),
01099                    RB_OBJ_STRING(a), RB_OBJ_STRING(b));
01100     return infect(a, p);
01101 }
01102 
01103 static VALUE
01104 inspect1(struct strscanner *p)
01105 {
01106     char buf[BUFSIZE];
01107     char *bp = buf;
01108     long len;
01109 
01110     if (p->curr == 0) return rb_str_new2("");
01111     if (p->curr > INSPECT_LENGTH) {
01112         strcpy(bp, "..."); bp += 3;
01113         len = INSPECT_LENGTH;
01114     }
01115     else {
01116         len = p->curr;
01117     }
01118     memcpy(bp, CURPTR(p) - len, len); bp += len;
01119     return rb_str_dump(rb_str_new(buf, bp - buf));
01120 }
01121 
01122 static VALUE
01123 inspect2(struct strscanner *p)
01124 {
01125     VALUE str;
01126     long len;
01127 
01128     if (EOS_P(p)) return rb_str_new2("");
01129     len = S_LEN(p) - p->curr;
01130     if (len > INSPECT_LENGTH) {
01131         str = rb_str_new(CURPTR(p), INSPECT_LENGTH);
01132         rb_str_cat2(str, "...");
01133     }
01134     else {
01135         str = rb_str_new(CURPTR(p), len);
01136     }
01137     return rb_str_dump(str);
01138 }
01139 
01140 /* =======================================================================
01141                               Ruby Interface
01142    ======================================================================= */
01143 
01144 /*
01145  * Document-class: StringScanner
01146  *
01147  * StringScanner provides for lexical scanning operations on a String.  Here is
01148  * an example of its usage:
01149  *
01150  *   s = StringScanner.new('This is an example string')
01151  *   s.eos?               # -> false
01152  *
01153  *   p s.scan(/\w+/)      # -> "This"
01154  *   p s.scan(/\w+/)      # -> nil
01155  *   p s.scan(/\s+/)      # -> " "
01156  *   p s.scan(/\s+/)      # -> nil
01157  *   p s.scan(/\w+/)      # -> "is"
01158  *   s.eos?               # -> false
01159  *
01160  *   p s.scan(/\s+/)      # -> " "
01161  *   p s.scan(/\w+/)      # -> "an"
01162  *   p s.scan(/\s+/)      # -> " "
01163  *   p s.scan(/\w+/)      # -> "example"
01164  *   p s.scan(/\s+/)      # -> " "
01165  *   p s.scan(/\w+/)      # -> "string"
01166  *   s.eos?               # -> true
01167  *
01168  *   p s.scan(/\s+/)      # -> nil
01169  *   p s.scan(/\w+/)      # -> nil
01170  *
01171  * Scanning a string means remembering the position of a <i>scan pointer</i>,
01172  * which is just an index.  The point of scanning is to move forward a bit at
01173  * a time, so matches are sought after the scan pointer; usually immediately
01174  * after it.
01175  *
01176  * Given the string "test string", here are the pertinent scan pointer
01177  * positions:
01178  *
01179  *     t e s t   s t r i n g
01180  *   0 1 2 ...             1
01181  *                         0
01182  *
01183  * When you #scan for a pattern (a regular expression), the match must occur
01184  * at the character after the scan pointer.  If you use #scan_until, then the
01185  * match can occur anywhere after the scan pointer.  In both cases, the scan
01186  * pointer moves <i>just beyond</i> the last character of the match, ready to
01187  * scan again from the next character onwards.  This is demonstrated by the
01188  * example above.
01189  *
01190  * == Method Categories
01191  *
01192  * There are other methods besides the plain scanners.  You can look ahead in
01193  * the string without actually scanning.  You can access the most recent match.
01194  * You can modify the string being scanned, reset or terminate the scanner,
01195  * find out or change the position of the scan pointer, skip ahead, and so on.
01196  *
01197  * === Advancing the Scan Pointer
01198  *
01199  * - #getch
01200  * - #get_byte
01201  * - #scan
01202  * - #scan_until
01203  * - #skip
01204  * - #skip_until
01205  *
01206  * === Looking Ahead
01207  *
01208  * - #check
01209  * - #check_until
01210  * - #exist?
01211  * - #match?
01212  * - #peek
01213  *
01214  * === Finding Where we Are
01215  *
01216  * - #beginning_of_line? (#bol?)
01217  * - #eos?
01218  * - #rest?
01219  * - #rest_size
01220  * - #pos
01221  *
01222  * === Setting Where we Are
01223  *
01224  * - #reset
01225  * - #terminate
01226  * - #pos=
01227  *
01228  * === Match Data
01229  *
01230  * - #matched
01231  * - #matched?
01232  * - #matched_size
01233  * - []
01234  * - #pre_match
01235  * - #post_match
01236  *
01237  * === Miscellaneous
01238  *
01239  * - <<
01240  * - #concat
01241  * - #string
01242  * - #string=
01243  * - #unscan
01244  *
01245  * There are aliases to several of the methods.
01246  */
01247 void
01248 Init_strscan()
01249 {
01250     ID id_scanerr = rb_intern("ScanError");
01251     VALUE tmp;
01252 
01253     StringScanner = rb_define_class("StringScanner", rb_cObject);
01254     ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError);
01255     if (!rb_const_defined(rb_cObject, id_scanerr)) {
01256         rb_const_set(rb_cObject, id_scanerr, ScanError);
01257     }
01258     tmp = rb_str_new2(STRSCAN_VERSION);
01259     rb_obj_freeze(tmp);
01260     rb_const_set(StringScanner, rb_intern("Version"), tmp);
01261     tmp = rb_str_new2("$Id$");
01262     rb_obj_freeze(tmp);
01263     rb_const_set(StringScanner, rb_intern("Id"), tmp);
01264 
01265     rb_define_alloc_func(StringScanner, strscan_s_allocate);
01266     rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1);
01267     rb_define_private_method(StringScanner, "initialize_copy", strscan_init_copy, 1);
01268     rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0);
01269     rb_define_method(StringScanner, "reset",       strscan_reset,       0);
01270     rb_define_method(StringScanner, "terminate",   strscan_terminate,   0);
01271     rb_define_method(StringScanner, "clear",       strscan_clear,       0);
01272     rb_define_method(StringScanner, "string",      strscan_get_string,  0);
01273     rb_define_method(StringScanner, "string=",     strscan_set_string,  1);
01274     rb_define_method(StringScanner, "concat",      strscan_concat,      1);
01275     rb_define_method(StringScanner, "<<",          strscan_concat,      1);
01276     rb_define_method(StringScanner, "pos",         strscan_get_pos,     0);
01277     rb_define_method(StringScanner, "pos=",        strscan_set_pos,     1);
01278     rb_define_method(StringScanner, "pointer",     strscan_get_pos,     0);
01279     rb_define_method(StringScanner, "pointer=",    strscan_set_pos,     1);
01280 
01281     rb_define_method(StringScanner, "scan",        strscan_scan,        1);
01282     rb_define_method(StringScanner, "skip",        strscan_skip,        1);
01283     rb_define_method(StringScanner, "match?",      strscan_match_p,     1);
01284     rb_define_method(StringScanner, "check",       strscan_check,       1);
01285     rb_define_method(StringScanner, "scan_full",   strscan_scan_full,   3);
01286 
01287     rb_define_method(StringScanner, "scan_until",  strscan_scan_until,  1);
01288     rb_define_method(StringScanner, "skip_until",  strscan_skip_until,  1);
01289     rb_define_method(StringScanner, "exist?",      strscan_exist_p,     1);
01290     rb_define_method(StringScanner, "check_until", strscan_check_until, 1);
01291     rb_define_method(StringScanner, "search_full", strscan_search_full, 3);
01292 
01293     rb_define_method(StringScanner, "getch",       strscan_getch,       0);
01294     rb_define_method(StringScanner, "get_byte",    strscan_get_byte,    0);
01295     rb_define_method(StringScanner, "getbyte",     strscan_getbyte,     0);
01296     rb_define_method(StringScanner, "peek",        strscan_peek,        1);
01297     rb_define_method(StringScanner, "peep",        strscan_peep,        1);
01298 
01299     rb_define_method(StringScanner, "unscan",      strscan_unscan,      0);
01300 
01301     rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0);
01302     rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?"));
01303     rb_define_method(StringScanner, "eos?",        strscan_eos_p,       0);
01304     rb_define_method(StringScanner, "empty?",      strscan_empty_p,     0);
01305     rb_define_method(StringScanner, "rest?",       strscan_rest_p,      0);
01306 
01307     rb_define_method(StringScanner, "matched?",    strscan_matched_p,   0);
01308     rb_define_method(StringScanner, "matched",     strscan_matched,     0);
01309     rb_define_method(StringScanner, "matched_size", strscan_matched_size, 0);
01310     rb_define_method(StringScanner, "[]",          strscan_aref,        1);
01311     rb_define_method(StringScanner, "pre_match",   strscan_pre_match,   0);
01312     rb_define_method(StringScanner, "post_match",  strscan_post_match,  0);
01313 
01314     rb_define_method(StringScanner, "rest",        strscan_rest,        0);
01315     rb_define_method(StringScanner, "rest_size",   strscan_rest_size,   0);
01316     rb_define_method(StringScanner, "restsize",    strscan_restsize,    0);
01317 
01318     rb_define_method(StringScanner, "inspect",     strscan_inspect,     0);
01319 }
01320