String
String
  • Strings of characters, immutable
  • s = 'Hello World!'
     
    #access element
    print(s[0]) #H
     
    #slicing
    print(s[:2]) #He
    
    #slice
    sl = slice(0, 10, 2)
    print(s[sl]) #HloWr
     
    #concatenation
    print(s+' ...')
     
    #repeat
    print(s*2)
     
    #in
    if 'H' in s:
        print('Contain H')
     
    #raw string, suppresses actual meaning of escape characters
    print(r'raw string\n')
    s = 'raw string\n'
    print('%r' % s) # output string as raw string
    		
    s = 'Hello World!'
     
    #capitalize, capitalizes first letter of string
    print(s.capitalize())
     
    #center
    print(s.center(20, '*'))
     
    #count
    print(s.count('l'))
     
    #endswith
    if s.endswith('!'):
        print('end with ! ...')
     
    #find
    print(s.find('or')) #7
    
    #index, find a string and raise an exception if the string is not found
    print(s.index('or'))
    
    #join
    c = '-'
    print(c.join(['a', 'b', 'c']))
    
    #lower
    print(s.lower())
    
    #replace
    print(s.replace('l', '-')) # replace all occurrences
    print(s.replace('l', '-', 2)) # replace at most max occurrences
    
    #split
    str = "Line1-abcdef, \nLine2-abc, \nLine4-abcd"
    print(str.split())
    
    import re
    # use multiple delimiter
    print(re.split('\n|, ',str)) # ['Line1-abcdef', '', 'Line2-abc', '', 'Line4-abcd']
    
    #strip
    print('  Hello ... '.strip())
    
    #upper
    print(s.upper()) #HELLO WORLD!
    		
    # Template
    from string import Template
    s = Template('$fname, $lname, $fname')
    
    sub = s.substitute(fname='Lin', lname='Chen')
    
    print(sub) # Lin, Chen, Lin
            
    Regular Expression
  • Pattern
  • Flags
  • import re
     
    phone = "2004-959-559 # This is Phone Number"
     
    # match
    # match RE pattern to string
    # checks for a match only at the beginning of the string
    m = re.match(r'(\d+)-(\d+)-\d+.*', phone) # use () to group matches
    if m:
        print(type(m))
        print(m.group()) #2004-959-559 # This is Phone Number
        print(m.group(1)) # 2004
        print(m.groups()) # ('2004', '959')
     
    # search
    # searches for first occurrence of RE pattern
    # checks for a match anywhere in the string
    s = re.search(r'\d+', phone)
    if s:
        print(type(s))
        print(s.group()) #2004
     
    #findall
    a = re.findall(r'\d+', phone)
    print(a) #['2004', '959', '559']
     
    #replace
    r = re.sub(r'\d', '*', phone)
    print(r) # ****-***-***, This is Phone Number
    		
    Unicode
  • a sequence of code points, immutable
  • Python keep characters as unicode in memory
  • type 'str' represents unicode in Python 3, type 'bytes' represent byte string
  • byte string can only contain ASCII literal characters
    1. Working with binary data, such as images, or audio files
    2. When sending or receiving data over network sockets
    3. When reading from or writing to binary files, such as reading an image file
    4. Cryptographic operations often work with binary data, and byte strings are used to hold the binary input, output, and intermediate values in these operations
    5. In some cases, using byte strings can be more memory-efficient and faster than using Unicode strings
  • # unicode and str
    # same in Python 3
    s = 'Café' # str
    s = u'Café' # str
    
    # byte string
    s = b'lin' # bytes
    
    # str to byte string
    s = 'Café'
    s.encode('utf-8') # utf-8 is the default encode standard
    
    # byte to str
    s = b'Caf\xc3\xa9'
    s.decode('utf-8')
    
    # get unicode code point
    c = ord(u'陈') #38472, int
    chr(38472)) #陈, str
            

  • read str, output str
    f = open('temp.txt', 'r') # read str
    l = next(f)
    
    o = open('output.txt', 'w') # write str
    o.write(l)
    
    o.close()
    f.close()
    		
  • read str, output byte string
    f = open('temp.txt', 'r') # read str
    l = next(f)
     
    o = open('output.txt', 'wb') # write byte string
    o.write(l.encode('utf-8')) 
     
    o.close()
    f.close()
            
  • read byte string, output str
    f = open('temp.txt', 'rb') # read byte string
    l = next(f)
     
    o = open('output.txt', 'w') # write str
    o.write(l.decode('utf-8')) 
     
    o.close()
    f.close()
    		
  • read byte string, output byte string
    f = open('temp.txt', 'rb') # read byte string
    l = next(f)
     
    o = open('output.txt', 'wb') # write byte string
    o.write(l) 
     
    o.close()
    f.close()
    		
  • Reference
  • Strings, Unicode, and Bytes in Python 3: Everything You Always Wanted to Know
  • Python 3 Standard Library
  • Tutorialspoint String
  • Regular Expression
  • Unicode cheat sheet