Problem: Packing and unpacking gzip files.
Go has the gzip package that makes working with gzip files easier.
Writer
from the gzip
package can be used to compress data, and as you'd expect, Reader
is used to read back the original data from a compressed gzip file.
Initial Approach
Natural instinct would be to read the input file content, compress/decompress the data with Writer/Reader respectively, and then write the resulting content to the output file.
package main
import (
"bytes"
"compress/gzip"
"fmt"
"io"
"io/ioutil"
"os"
"strings"
)
func packGzipFile(srcFilePath, dstFilePath string) error {
inputBytes, err := ioutil.ReadFile(srcFilePath)
if err != nil {
fmt.Println("Error while opening input file :", srcFilePath, "Error :", err)
return err
}
var compressedByteBuffer bytes.Buffer
gzWriter := gzip.NewWriter(&compressedByteBuffer)
_, err = gzWriter.Write(inputBytes)
if err != nil {
fmt.Println("Unable to perform write operation :", err)
return err
}
gzWriter.Close()
compressedData := compressedByteBuffer.Bytes()
err = ioutil.WriteFile(dstFilePath, compressedData, 0660)
if err != nil {
fmt.Println("Error while writing to file :", dstFilePath, "Error :", err)
return err
}
return nil
}
func unpackGzipFile(srcFilePath, dstFilePath string) error {
data, err := ioutil.ReadFile(srcFilePath)
b := bytes.NewBuffer(data)
var r io.Reader
r, err = gzip.NewReader(b)
if err != nil {
fmt.Println("Error while creating a reader :", err)
return err
}
var resB bytes.Buffer
_, err = resB.ReadFrom(r)
if err != nil {
fmt.Println("Error while reading :", err)
return err
}
resData := resB.Bytes()
err = ioutil.WriteFile(dstFilePath, resData, 0660)
if err != nil {
fmt.Println("Error while writing :", err)
return err
}
return nil
}
// main parses "gztool <c|d> <input_filename> <output_filename>" and
// dispatches to the matching pack/unpack helper. It exits with a
// non-zero status on any failure so shells and scripts can detect it.
func main() {
	if len(os.Args) < 4 {
		fmt.Println("USAGE: ./gztool <c | d> <input_filename> <output_filename>")
		os.Exit(1)
	}
	mode := strings.ToLower(os.Args[1])
	inputFilename := os.Args[2]
	outputFilename := os.Args[3]
	var err error
	switch mode {
	case "c":
		err = packGzipFile(inputFilename, outputFilename)
	case "d":
		err = unpackGzipFile(inputFilename, outputFilename)
	default:
		fmt.Println("Invalid mode. Use \"c\" or \"d\"")
		os.Exit(1)
	}
	// Previously errors from pack/unpack were dropped and the process
	// always exited 0, even when the operation failed.
	if err != nil {
		os.Exit(1)
	}
}
This approach would start to falter (in terms of memory usage) as the size of the file that we're working on increases.
Let's add profiling to get some information on CPU and Memory Usage.
Profiling features are built into Go.
To make things simpler, I've used the profile package for profiling.
package main
import (
"bytes"
"compress/gzip"
"fmt"
"io"
"io/ioutil"
"os"
"strings"
"github.com/pkg/profile"
)
func packGzipFile(srcFilePath, dstFilePath string) error {
inputBytes, err := ioutil.ReadFile(srcFilePath)
if err != nil {
fmt.Println("Error while opening input file :", srcFilePath, "Error :", err)
return err
}
var compressedByteBuffer bytes.Buffer
gzWriter := gzip.NewWriter(&compressedByteBuffer)
_, err = gzWriter.Write(inputBytes)
if err != nil {
fmt.Println("Unable to perform write operation :", err)
return err
}
gzWriter.Close()
compressedData := compressedByteBuffer.Bytes()
err = ioutil.WriteFile(dstFilePath, compressedData, 0660)
if err != nil {
fmt.Println("Error while writing to file :", dstFilePath, "Error :", err)
return err
}
return nil
}
func unpackGzipFile(srcFilePath, dstFilePath string) error {
data, err := ioutil.ReadFile(srcFilePath)
b := bytes.NewBuffer(data)
var r io.Reader
r, err = gzip.NewReader(b)
if err != nil {
fmt.Println("Error while creating a reader :", err)
return err
}
var resB bytes.Buffer
_, err = resB.ReadFrom(r)
if err != nil {
fmt.Println("Error while reading :", err)
return err
}
resData := resB.Bytes()
err = ioutil.WriteFile(dstFilePath, resData, 0660)
if err != nil {
fmt.Println("Error while writing :", err)
return err
}
return nil
}
// main delegates all work to run so that deferred profiler cleanup still
// executes before the process exits: os.Exit skips deferred calls, so
// calling it directly from main would discard any captured profile.
func main() {
	os.Exit(run())
}

// run parses "gztool <c|d> <input> <output> [cpu|mem]", optionally
// enables profiling, dispatches to the pack/unpack helper, and returns
// the process exit code (0 on success, 1 on failure).
func run() int {
	if len(os.Args) < 4 {
		fmt.Println("USAGE: ./gztool <c | d> <input_filename> <output_filename> [cpu | mem]")
		return 1
	}
	if len(os.Args) == 5 {
		switch os.Args[4] {
		case "cpu":
			defer profile.Start().Stop()
		case "mem":
			defer profile.Start(profile.MemProfile).Stop()
		}
	}
	mode := strings.ToLower(os.Args[1])
	inputFilename := os.Args[2]
	outputFilename := os.Args[3]
	var err error
	switch mode {
	case "c":
		err = packGzipFile(inputFilename, outputFilename)
	case "d":
		err = unpackGzipFile(inputFilename, outputFilename)
	default:
		fmt.Println("Invalid mode. Use \"c\" or \"d\"")
		return 1
	}
	// Previously errors from pack/unpack were dropped and the tool
	// always exited 0, even on failure.
	if err != nil {
		return 1
	}
	return 0
}
Stats for a 200 MB file:
Action | CPU Time | Memory Usage |
---|---|---|
Packing | 6.01 s | 287.87 MB |
Unpacking | 705.24 ms | 286.84 MB |
Not only is memory usage too high, but the process could also be killed with an
out-of-memory error if the size of the file is too big.
Efficient Approach
An efficient approach is to pipe the contents of the file through the Writer/Reader. This way, only a small chunk of the actual file needs to be in memory at any given time.
Thanks to goroutines, we can write data concurrently to one end of the pipe and read the data at the other.
package main
import (
"compress/gzip"
"fmt"
"io"
"os"
"strings"
"github.com/pkg/profile"
)
func packGzipFile(srcFilePath, dstFilePath string) error {
srcFile, err := os.Open(srcFilePath)
if err != nil {
fmt.Println("Error while opening source file :", err)
return err
}
dstFile, err := os.OpenFile(dstFilePath, os.O_CREATE|os.O_WRONLY, 0660)
if err != nil {
fmt.Println("Error while opening destination file :", err)
return err
}
ioReader, ioWriter := io.Pipe()
go func() {
defer func() {
srcFile.Close()
ioWriter.Close()
}()
read, err := io.Copy(ioWriter, srcFile)
if err != nil {
fmt.Println("Error while reading :", err)
return
}
fmt.Printf("Read %v bytes.\n", read)
}()
gzipWriter := gzip.NewWriter(dstFile)
written, err := io.Copy(gzipWriter, ioReader)
if err != nil {
fmt.Println("Error while writing :", err)
return err
}
fmt.Printf("Wrote %v bytes.\n", written)
ioReader.Close()
gzipWriter.Close()
dstFile.Close()
return nil
}
func unpackGzipFile(srcFilePath, dstFilePath string) error {
gzFile, err := os.Open(srcFilePath)
if err != nil {
fmt.Println("Error while opening source file :", err)
return err
}
dstFile, err := os.OpenFile(dstFilePath, os.O_CREATE|os.O_WRONLY, 0660)
if err != nil {
fmt.Println("Error while opening destination file :", err)
return err
}
ioReader, ioWriter := io.Pipe()
go func() {
gzReader, _ := gzip.NewReader(gzFile)
defer func() {
gzFile.Close()
gzReader.Close()
ioWriter.Close()
}()
read, err := io.Copy(ioWriter, gzReader)
if err != nil {
fmt.Println("Error while reading :", err)
return
}
fmt.Printf("Read %v bytes.\n", read)
}()
written, err := io.Copy(dstFile, ioReader)
if err != nil {
fmt.Println("Error while writing :", err)
return err
}
fmt.Printf("Wrote %v bytes.\n", written)
ioReader.Close()
dstFile.Close()
return nil
}
// main delegates all work to run so that deferred profiler cleanup still
// executes before the process exits: os.Exit skips deferred calls, so
// calling it directly from main would discard any captured profile.
func main() {
	os.Exit(run())
}

// run parses "gztool <c|d> <input> <output> [cpu|mem]", optionally
// enables profiling, dispatches to the pack/unpack helper, and returns
// the process exit code (0 on success, 1 on failure).
func run() int {
	if len(os.Args) < 4 {
		fmt.Println("USAGE: ./gztool <c | d> <input_filename> <output_filename> [cpu | mem]")
		return 1
	}
	if len(os.Args) == 5 {
		switch os.Args[4] {
		case "cpu":
			defer profile.Start().Stop()
		case "mem":
			defer profile.Start(profile.MemProfile).Stop()
		}
	}
	mode := strings.ToLower(os.Args[1])
	inputFilename := os.Args[2]
	outputFilename := os.Args[3]
	var err error
	switch mode {
	case "c":
		err = packGzipFile(inputFilename, outputFilename)
	case "d":
		err = unpackGzipFile(inputFilename, outputFilename)
	default:
		fmt.Println("Invalid mode. Use \"c\" or \"d\"")
		return 1
	}
	// Previously errors from pack/unpack were dropped and the tool
	// always exited 0, even on failure.
	if err != nil {
		return 1
	}
	return 0
}
Stats:
Action | CPU Time | Memory Usage |
---|---|---|
Packing | 6.21 s | 793.25 kB |
Unpacking | 300.91 ms | nil^ |
^ - The process completed before go profile could capture any data.
Conclusion
The takeaway is to consider a program's full resource profile — not just its time complexity, but also its space complexity and peak memory usage.
Also, Go has several tools like pprof
and vet
that would help in diagnosing common performance problems.
Top comments (0)