利用Python处理netcdf数据

利用Python处理netcdf数据

简介

NetCDF(Network Common Data Format)由UCAR(University Corporation for Atmospheric Research)设计提出,其官方的定义是:NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data.

NetCDF是面向多维数组的数据集,一个NetCDF文件主要是Dimensions, Variables, Attributes, Data 四个部分组成的:

  • Dimension主要是对维度的定义说明,例如:经度,纬度,时间等;

  • Variables是对数据表示的现象的说明,例如:温度,湿度,高程等;

  • Attributes是一些辅助的元信息说明,例如变量的单位等;

  • Data是主要对现象的观测数据集。

NetCDF有两个数据模型:经典模型(NetCDF3之前模型)和增强模型(NetCDF4)

NetCDF最新版本是NetCDF4,NetCDF4的API接口建立在HDF5之上,和HDF5是兼容的.

安装

1
pip install netCDF4

简单介绍

1. 创建/打开/关闭netCDF文件

1
2
3
4
5
import netCDF4 as nc

# Create a new NetCDF4 file; mode 'w' truncates any existing file of that name.
rootgrp = nc.Dataset('test.nc', 'w', format='NETCDF4')
# Fixed: original used the Python-2 print statement.
print(rootgrp.data_model)  # e.g. 'NETCDF4'
rootgrp.close()

2. 在netCDF文件中的组

1
2
3
4
5
6
7
8
# Reopen the file in append mode and add groups (NetCDF4 groups work like directories).
rootgrp = nc.Dataset('test.nc', 'a')
fcstgrp = rootgrp.createGroup('forecasts')
analgrp = rootgrp.createGroup('analyses')
print(rootgrp.groups)

# Create nested groups with a unix-like path; intermediate groups are created on demand.
# (Fixed: original used '//', which is not a Python comment.)
fcstgrp1 = rootgrp.createGroup('/forecasts/model1')
fcstgrp2 = rootgrp.createGroup('/forecasts/model2')

3. 在netCDF文件中的维度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Define dimensions: a size of None (or 0) makes the dimension unlimited (growable).
level = rootgrp.createDimension('level', None)
time = rootgrp.createDimension('time', None)
lat = rootgrp.createDimension('lat', 73)
lon = rootgrp.createDimension('lon', 144)

print(rootgrp.dimensions)  # dict mapping name -> Dimension object

print(len(lon))  # current size of the 'lon' dimension

# Fixed: Python-2 print statements.
print(lon.isunlimited())   # False: fixed size
print(time.isunlimited())  # True: unlimited

# Fixed: loop body was not indented.
for dimobj in rootgrp.dimensions.values():
    print(dimobj)

4. 在netCDF文件中的变量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Coordinate variables: 1-D variables with the same name as their dimension.
times = rootgrp.createVariable('time', 'f8', ('time', ))
levels = rootgrp.createVariable('level', 'i4', ('level', ))
latitudes = rootgrp.createVariable('lat', 'f4', ('lat', ))
longitudes = rootgrp.createVariable('lon', 'f4', ('lon', ))

# A 4-D data variable over (time, level, lat, lon).
# (Fixed: the original comment said "2-D".)
temp = rootgrp.createVariable('temp', 'f4', ('time', 'level', 'lat', 'lon',))
# Variables can also be created inside a group via a unix-like path.
ftmp = rootgrp.createVariable('/forecasts/model1/temp', 'f4', ('time', 'level', 'lat', 'lon',))

print(rootgrp['/forecasts/model1'])

print(rootgrp['/forecasts/model1/temp'])

print(rootgrp.variables)  # dict mapping name -> Variable object

5. netCDF文件的属性

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import time
# Global (file-level) attributes: arbitrary metadata set as Python attributes.
rootgrp.description = 'bogus example script'
rootgrp.history = 'Create ' + time.ctime(time.time())
rootgrp.source = 'netCDF4 Python module tutorial'
# Per-variable attributes: units, calendar, etc.
latitudes.units = 'degrees north'
longitudes.units = 'degrees east'
levels.units = 'hPa'
temp.units = 'K'
times.units = 'hours since 0001-01-01 00:00:00.0'
times.calendar = 'gregorian'

# Fixed: loop body was not indented.
for name in rootgrp.ncattrs():
    print('Global attr', name, "=", getattr(rootgrp, name))

print(rootgrp.__dict__)  # all global attributes as a dict

6. 从netCDF变量中读取和恢复数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import numpy
# Fill the coordinate variables with regular grids.
lats = numpy.arange(-90, 91, 2.5)
lons = numpy.arange(-180, 180, 2.5)
latitudes[:] = lats
longitudes[:] = lons

print('latitudes =\n', latitudes[:])

nlats = len(rootgrp.dimensions['lat'])
nlons = len(rootgrp.dimensions['lon'])

print('temp shape before adding data =', temp.shape)

from numpy.random import uniform
# Assigning past the current size grows the unlimited time/level dimensions.
temp[0:5, 0:10, :, :] = uniform(size=(5, 10, nlats, nlons))

print('temp shape after adding data =', temp.shape)

# Fixed: the original printed "after adding pressure data" BEFORE the data
# was written; assign the levels first, then report the shape.
levels[:] = [1000., 850., 700., 500., 300., 250., 200., 150., 100., 50.]

print('levels shape after adding pressure data =', levels.shape)

# Fancy (integer/boolean) indexing works on netCDF variables like numpy arrays.
temp[0, 0, [0, 1, 2, 3], [0, 1, 2, 3]]

tempdat = temp[::2, [1, 3, 6], lats > 0, lons > 0]

print('shape of fancy temp slice = ', tempdat.shape)

7. 处理时间坐标

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from datetime import datetime, timedelta
from netCDF4 import num2date, date2num

# One timestamp every 12 hours starting 2001-03-01, one per time step of temp.
dates = [datetime(2001, 3, 1) + n * timedelta(hours=12)
         for n in range(temp.shape[0])]

# Encode datetimes as numbers consistent with the variable's units/calendar attrs.
times[:] = date2num(dates, units=times.units, calendar=times.calendar)

print('time values (in units %s): ' % times.units + '\n', times[:])

#time values (in units hours since January 1, 0001) : [ 17533056. 17533068. 17533080. 17533092. 17533104.]

# Decode the numeric values back into datetime objects.
dates = num2date(times[:], units=times.units, calendar=times.calendar)

# Fixed: typo "correspoding" in the printed message.
print('dates corresponding to time values:\n', dates)

8. 从一个netCDF数据集中读取数据

1
2
3
4
5
6
7
8
9
10
# Write ten small files, each holding ten consecutive values of 'x'.
# Fixed: original called np.Dataset (np is numpy; Dataset comes from netCDF4,
# aliased nc in this file) and the loop body was not indented.
for nf in range(10):
    f = nc.Dataset('mftest%s.nc' % nf, 'w')
    f.createDimension('x', None)
    x = f.createVariable('x', 'i', ('x',))
    x[0:10] = numpy.arange(nf * 10, 10 * (nf + 1))
    f.close()

# Read all ten files back as one virtual dataset, aggregated along 'x'.
from netCDF4 import MFDataset
f = MFDataset('mftest*nc')
print(f.variables['x'][:])

9. 关于netCDF变量的有效压缩

1
2
3
4
5
# Baseline: no compression.
temp = rootgrp.createVariable('temp', 'f4', ('time', 'level', 'lat', 'lon',))

# Lossless zlib compression.
temp = rootgrp.createVariable('temp', 'f4', ('time', 'level', 'lat', 'lon',), zlib=True)

# Lossy: quantize to 3 significant digits before compressing (better ratios).
# Fixed: keyword was misspelled 'least_significantdigis'.
temp = rootgrp.createVariable('temp', 'f4', ('time', 'level', 'lat', 'lon',), zlib=True, least_significant_digit=3)

10. 除固定类型的同构数组之外的——复合数据类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
f = nc.Dataset('complex.nc', 'w')
size = 3

# Sample complex data to store.
datac = numpy.exp(1j * (1. + numpy.linspace(0, numpy.pi, size)))

# NetCDF has no native complex type: model it as a compound type with
# two float64 fields, 'real' and 'imag'.
complex128 = numpy.dtype([('real', numpy.float64), ('imag', numpy.float64)])

complex128_t = f.createCompoundType(complex128, 'complex128')
f.createDimension('x_dim', None)
# Fixed: createVariable was called without its Dataset receiver (NameError).
v = f.createVariable('cmplx_var', complex128_t, 'x_dim')
data = numpy.empty(size, complex128)
data['real'] = datac.real; data['imag'] = datac.imag
v[:] = data
f.close(); f = nc.Dataset('complex.nc')
v = f.variables['cmplx_var']
datain = v[:]

# Rebuild a complex array from the two stored fields.
datac2 = numpy.empty(datain.shape, numpy.complex128)

datac2.real = datain['real']; datac2.imag = datain['imag']

# Fixed: typo 'datatc' (NameError).
print(datac.dtype, datac)

print(datac2.dtype, datac2)

print(f)

11. 可变(vlen)数据类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
f = nc.Dataset("tst_vlen.nc", "w")
vlen_t = f.createVLType(numpy.int32, 'phnoy_vlen')

x = f.createDimension('x', 3)
y = f.createDimension('y', 4)
vlvar = f.createVariable('phnoy_vlen_var', vlen_t, ('y', 'x'))

import random
data = numpy.empty(len(y)*len(x),object)
for n in range(len(y)*len(x)):
data[n] = numpy.arange(random.randint(1,10),dtype="int32")+1
data = numpy.reshape(data,(len(y),len(x)))
vlvar[:] = data
print("vlen variable =\n",vlvar[:])

print(f)

print(f.variables["phony_vlen_var"])

print(f.VLtypes["phony_vlen"])

z = f.createDimension("z",10)
strvar = rootgrp.createVariable("strvar", str, "z")
chars = "1234567890aabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

data = numpy.empty(10,"O")
for n in range(10):
stringlen = random.randint(2,12)
data[n] = "".join([random.choice(chars) for i in range(stringlen)])
strvar[:] = data
print("variable-length string variable:\n",strvar[:])

print(f)

print(f.variables["strvar"])

12. 枚举数据类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Fixed: original did `nc = Dataset(...)`, but bare Dataset is undefined here
# and assigning to nc shadows the netCDF4 module alias used elsewhere.
f = nc.Dataset('clouds.nc', 'w')
# python dict with allowed values and their names.
enum_dict = {u'Altocumulus': 7, u'Missing': 255,
             u'Stratus': 2, u'Clear': 0,
             u'Nimbostratus': 6, u'Cumulus': 4, u'Altostratus': 5,
             u'Cumulonimbus': 1, u'Stratocumulus': 3}
# create the Enum type called 'cloud_t'.
cloud_type = f.createEnumType(numpy.uint8, 'cloud_t', enum_dict)
print(cloud_type)

time = f.createDimension('time', None)
# create a 1d variable of type 'cloud_type'.
# The fill_value is set to the 'Missing' named value.
# Fixed: the assignment was split across lines without a continuation (SyntaxError).
cloud_var = f.createVariable('primary_cloud', cloud_type, 'time',
                             fill_value=enum_dict['Missing'])
# write some data to the variable.
cloud_var[:] = [enum_dict['Clear'], enum_dict['Stratus'],
                enum_dict['Cumulus'], enum_dict['Missing'],
                enum_dict['Cumulonimbus']]
f.close()
# reopen the file, read the data.
f = nc.Dataset('clouds.nc')
cloud_var = f.variables['primary_cloud']
print(cloud_var)

print(cloud_var.datatype.enum_dict)

print(cloud_var[:])

13. 并行IO

1
2
3
4
5
6
7
8
9
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset
rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)
# Fixed: original assigned the Dataset to the name `nc`, shadowing the module
# alias used elsewhere in this tutorial.
f = Dataset('parallel_tst.nc', 'w', parallel=True)
d = f.createDimension('dim', 4)
# Fixed: `numpy.int` was a NameError here (numpy is imported as np), and the
# numpy.int alias was removed in NumPy 1.24 — use a concrete dtype.
v = f.createVariable('var', np.int64, 'dim')
v[rank] = rank  # each MPI rank writes its own slot
f.close()

14. 字符串处理

1
2
3
4
5
6
7
8
9
10
11
12
f = nc.Dataset("stringtest.nc", "w", format='NETCDF4_CLASSIC')

x = f.createDimension('nchars', 3)
y = f.createDimension('nstrings', None)
v = f.createVariable('strings', 'S1', ('nstrings', 'nchars'))
datain = numpy.array(['foo', 'bar'], dtype='S3')
V[:] = stringtochar(datain)
V[:]
V_Encoding = 'ascii'
V[:] = datain
V[:]
f.close()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Fixed: original did `nc = Dataset(...)` (bare Dataset undefined; shadows the
# module alias), and raw REPL output lines were left in as code (SyntaxError).
# Outputs are now shown as comments.
f = nc.Dataset('compoundstring_example.nc', 'w')
dtype = numpy.dtype([('observation', 'f4'),
                     ('station_name', 'S80')])
station_data_t = f.createCompoundType(dtype, 'station_data')
f.createDimension('station', None)
statdat = f.createVariable('station_obs', station_data_t, ('station',))
data = numpy.empty(2, dtype)
data['observation'][:] = (123., 3.14)
data['station_name'][:] = ('Boulder', 'New York')
print(statdat.dtype)  # strings actually stored as character arrays:
# {'names':['observation','station_name'], 'formats':['<f4',('S1', (80,))], 'offsets':[0,4], 'itemsize':84, 'aligned':True}
statdat[:] = data  # strings converted to character arrays internally
print(statdat[:])  # character arrays converted back to strings:
# [(123. , 'Boulder') ( 3.14, 'New York')]
print(statdat[:].dtype)
# {'names':['observation','station_name'], 'formats':['<f4','S80'], 'offsets':[0,4], 'itemsize':84, 'aligned':True}
statdat.set_auto_chartostring(False)  # turn off auto-conversion
statdat[:] = data.view(dtype=[('observation', 'f4'), ('station_name', 'S1', 10)])
print(statdat[:])  # now structured array with char array subtype is returned:
# [(123. , ['B', 'o', 'u', 'l', 'd', 'e', 'r', '', '', ''])
#  ( 3.14, ['N', 'e', 'w', ' ', 'Y', 'o', 'r', 'k', '', ''])]
f.close()