encode & decode in Python

#python #encode #decode #string

*Memo:

My post explains string, bytes and bytearray functions.

str.encode() can encode a string to a bytes object, while bytes.decode() and bytearray.decode() can decode a bytes object and a bytearray object, respectively, to a string, as shown below:

*Memo:

The 1st argument is encoding(Optional-Default:'utf-8'-Type:str):
- It decides the encoding or decoding standard.
- 'utf-8', 'utf-7', 'utf-16', 'big5', 'ascii', etc. can be set to it.
- You can see Standard Encodings for more possible values.
The 2nd argument is errors(Optional-Default:'strict'-Type:str):
- It controls how encoding or decoding errors are handled, using an error handler such as 'strict', 'ignore', 'replace', 'xmlcharrefreplace', 'backslashreplace', etc.
- 'strict' raises a UnicodeError (UnicodeEncodeError for encoding, UnicodeDecodeError for decoding) if there is a character which cannot be encoded or a byte which cannot be decoded.
- 'ignore' skips a character which cannot be encoded and a byte which cannot be decoded.
- 'replace' replaces a character which cannot be encoded with ? and a byte which cannot be decoded with � (U+FFFD).
- 'xmlcharrefreplace' replaces the character, which cannot be encoded, with the XML/HTML numeric character reference format &#num;:
  - It doesn't support decoding, so decode() raises a TypeError if there is a byte which cannot be decoded, while no error occurs if every byte can be decoded.
- 'backslashreplace' replaces a character which cannot be encoded with a backslashed escape sequence in the format \xhh, \uxxxx or \Uxxxxxxxx, and a byte which cannot be decoded with one in the format \xhh.
- You can see more error handlers.
- You can create your own error handler with codecs.register_error().
After using encode(), the result is a bytes object, which has decode() but doesn't have encode().
After using decode(), the result is a string, which has encode() but doesn't have decode().

v = "Lёт's gφ!" # Let's go!

b = v.encode()
b = v.encode(encoding='utf-8', errors='strict')
b = v.encode(encoding='utf-8')
b = v.encode(errors='strict')
ba = bytearray(v.encode())

print(b)
# b"L\xd1\x91\xd1\x82's g\xcf\x86!"

print(ba)
# bytearray(b"L\xd1\x91\xd1\x82\'s g\xcf\x86!")

v = b.decode()
v = b.decode(encoding='utf-8', errors='strict')
v = b.decode(encoding='utf-8')
v = b.decode(errors='strict')
v = ba.decode()
v = ba.decode(encoding='utf-8', errors='strict')
v = ba.decode(encoding='utf-8')
v = ba.decode(errors='strict')

print(v)
# Lёт's gφ!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='utf-7')
ba = bytearray(v.encode(encoding='utf-7'))

print(b)
# b"L+BFEEQg's g+A8Y!"

print(ba)
# bytearray(b"L+BFEEQg\'s g+A8Y!")

v = b.decode(encoding='utf-7')
v = ba.decode(encoding='utf-7')

print(v)
# Lёт's gφ!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='utf-16')
ba = bytearray(v.encode(encoding='utf-16'))

print(b)
# b"\xff\xfeL\x00Q\x04B\x04'\x00s\x00 \x00g\x00\xc6\x03!\x00"

print(ba)
# bytearray(b"\xff\xfeL\x00Q\x04B\x04\'\x00s\x00 \x00g\x00\xc6\x03!\x00")

v = b.decode(encoding='utf-16')
v = ba.decode(encoding='utf-16')

print(v)
# Lёт's gφ!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='big5')
ba = bytearray(v.encode(encoding='big5'))

print(b)
# b"L\xc7\xce\xc7\xdb's g\xa3p!"

print(ba)
# bytearray(b"L\xc7\xce\xc7\xdb\'s g\xa3p!")

v = b.decode(encoding='big5')
v = ba.decode(encoding='big5')

print(v)
# Lёт's gφ!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='ignore')
ba = bytearray(v.encode(encoding='ascii', errors='ignore'))

print(b)
# b"L's g!"

print(ba)
# bytearray(b"L\'s g!")

print(b.decode(encoding='ascii', errors='ignore'))
print(ba.decode(encoding='ascii', errors='ignore'))
# L's g!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='replace')
ba = bytearray(v.encode(encoding='ascii', errors='replace'))

print(b)
# b"L??'s g?!"

print(ba)
# bytearray(b"L??\'s g?!")

print(b.decode(encoding='ascii', errors='replace'))
print(ba.decode(encoding='ascii', errors='replace'))
# L??'s g?!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='xmlcharrefreplace')
ba = bytearray(v.encode(encoding='ascii', errors='xmlcharrefreplace'))

print(b)
# b"L&#1105;&#1090;'s g&#966;!"

print(ba)
# bytearray(b"L&#1105;&#1090;\'s g&#966;!")

print(b.decode(encoding='ascii', errors='xmlcharrefreplace'))
print(ba.decode(encoding='ascii', errors='xmlcharrefreplace'))
# L&#1105;&#1090;'s g&#966;!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='backslashreplace')
ba = bytearray(v.encode(encoding='ascii', errors='backslashreplace'))

print(b)
# b"L\\u0451\\u0442's g\\u03c6!"

print(ba)
# bytearray(b"L\\u0451\\u0442\'s g\\u03c6!")

print(b.decode(encoding='ascii', errors='backslashreplace'))
print(ba.decode(encoding='ascii', errors='backslashreplace'))
# L\u0451\u0442's g\u03c6!

import codecs

def hashreplace_handler(x):
    """Error handler that substitutes '#' for each item in the failing span.

    Args:
        x: The error object handed to the handler; its ``start`` and ``end``
            attributes delimit the offending span of the input.

    Returns:
        A ``(replacement, position)`` tuple: one '#' per item between
        ``x.start`` and ``x.end``, followed by ``x.end`` itself.
    """
    span_length = x.end - x.start
    replacement = '#' * span_length
    return (replacement, x.end)

codecs.register_error('hashreplace', hashreplace_handler)

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='hashreplace')
ba = bytearray(v.encode(encoding='ascii', errors='hashreplace'))

print(b)
# b"L##'s g#!"

print(ba)
# bytearray(b"L##\'s g#!")

print(b.decode(encoding='ascii', errors='hashreplace'))
print(ba.decode(encoding='ascii', errors='hashreplace'))
# L##'s g#!

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='ascii', errors='strict')
ba = bytearray(v.encode(encoding='ascii', errors='strict'))
# UnicodeEncodeError: 'ascii' codec can't encode characters
# in position 1-2: ordinal not in range(128)

import codecs

def hashreplace_handler(x):
    """Error handler that substitutes '#' for each item in the failing span.

    Args:
        x: The error object handed to the handler; its ``start`` and ``end``
            attributes delimit the offending span of the input.

    Returns:
        A ``(replacement, position)`` tuple: one '#' per item between
        ``x.start`` and ``x.end``, followed by ``x.end`` itself.
    """
    resume_at = x.end
    filler = '#' * (resume_at - x.start)
    return (filler, resume_at)

codecs.register_error('hashreplace', hashreplace_handler)

v = "Lёт's gφ!" # Let's go!

b = v.encode(encoding='utf-8', errors='strict')
ba = bytearray(v.encode(encoding='utf-8', errors='strict'))

print(b)
# b"L\xd1\x91\xd1\x82's g\xcf\x86!"

print(ba)
# bytearray(b"L\xd1\x91\xd1\x82\'s g\xcf\x86!")

print(b.decode(encoding='ascii', errors='ignore'))
print(ba.decode(encoding='ascii', errors='ignore'))
# L's g!

print(b.decode(encoding='ascii', errors='replace'))
print(ba.decode(encoding='ascii', errors='replace'))
# L����'s g��!

print(b.decode(encoding='ascii', errors='backslashreplace'))
print(ba.decode(encoding='ascii', errors='backslashreplace'))
# L\xd1\x91\xd1\x82's g\xcf\x86!

print(b.decode(encoding='ascii', errors='hashreplace'))
print(ba.decode(encoding='ascii', errors='hashreplace'))
# L####'s g##!

print(b.decode(encoding='ascii', errors='strict'))
print(ba.decode(encoding='ascii', errors='strict'))
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xd1
# in position 1: ordinal not in range(128)

print(b.decode(encoding='ascii', errors='xmlcharrefreplace'))
print(ba.decode(encoding='ascii', errors='xmlcharrefreplace'))
# TypeError: don't know how to handle UnicodeDecodeError in error callback

DEV Community

encode & decode in Python

Top comments (0)