exploration.org 21.4 KB

Exploration des données TERLAB

Accès à un fichier TERLAB

Trouver le fichier d’un département

Le nom des fichiers zippés est RPG_TERLAB_DEP65-66_2017.7z ou RPG_TERLAB_DEP10_2017.7z


terlabdir = '/home/inglada/stok/DATA/OSO/MAA_SSP/'
import glob
def find_terlab_file(dpt):
    tld = terlabdir
    tlfiles = glob.glob('{}/*.7z'.format(tld))
    filefound = None
    for f in tlfiles:
        tok = f.split('/')[-1].split('_')[2]
        if tok == 'Corse' and (dpt=='2A' or dpt=='2B'):
            return f
        else:
            tok = tok[3:]
            if len(tok) == 2 and tok == str(dpt):
                return f
            else:
                toks = tok.split('-')
                if str(dpt) in toks:
                    return f
    return None

assert '/home/inglada/stok/DATA/OSO/MAA_SSP/RPG_TERLAB_DEP90-91-93-94-95_2017.7z'==find_terlab_file('94')
assert '/home/inglada/stok/DATA/OSO/MAA_SSP/RPG_TERLAB_DEP90-91-93-94-95_2017.7z'==find_terlab_file('95')
assert '/home/inglada/stok/DATA/OSO/MAA_SSP/RPG_TERLAB_DEP31_2017.7z'==find_terlab_file('31')
assert '/home/inglada/stok/DATA/OSO/MAA_SSP/RPG_TERLAB_Corse_2017.7z'==find_terlab_file('2A')

Mettre le fichier dans un data frame geopandas

import subprocess
import os
import geopandas as gp
def shape2df(departement):
    zipfile = find_terlab_file(str(departement))
    devnull = open(os.devnull, 'w')
    subprocess.call(["7z", "x", zipfile, "-aoa"], stdout=devnull)
    shapefile = '{}/SURFACES-2017-PARCELLES-GRAPHIQUES-CONSTATEES_0{}_20180210.shp'.format(terlabdir, str(departement))
    return gp.read_file(shapefile)

Quelles sont les variables intéressantes dans les données TERLAB

On vérifie que le fichier est bien lu

df31 = shape2df(31)
print(df31.head())
CODE_CULTU  SURF_ADM PRECISION  SEMENCE DEST_ICHN                        ...                             LIBCULTURE SURUTISOL  RENDNORME MMEAU                                           geometry
0        ORH      1.42      None        0      None                        ...                          03_ORGE_HIVER       NaN        NaN   NaN  POLYGON ((506016.382600002 6234988.195600003, ...
1        PPH      0.00      None        0      None                        ...                          00_XXXXXXXXXX       NaN        NaN   NaN  POLYGON ((506032.0680000037 6235002.251000002,...
2        PPH      0.00      None        0      None                        ...                          00_XXXXXXXXXX       NaN        NaN   NaN  POLYGON ((506108.3540000021 6234766.824000001,...
3        ORH      1.87      None        0      None                        ...                          03_ORGE_HIVER       NaN        NaN   NaN  POLYGON ((506107.6996000037 6234765.226100001,...
4        PPH      0.00      None        0      None                        ...                          00_XXXXXXXXXX       NaN        NaN   NaN  POLYGON ((506874.6119000018 6234495.103300001,...

[5 rows x 18 columns]

On liste les colonnes les lignes

print(df31.columns)
Index(['CODE_CULTU', 'SURF_ADM', 'PRECISION', 'SEMENCE', 'DEST_ICHN',
       'CULTURE_D1', 'CULTURE_D2', 'BIO', 'ENGAGEMENT', 'MARAICHAGE',
       'AGROFOREST', 'TLENQ', 'CODUTISOL', 'LIBCULTURE', 'SURUTISOL',
       'RENDNORME', 'MMEAU', 'geometry'],
      dtype='object')

Une ligne

print(df31.iloc[0])
CODE_CULTU                                                  ORH
SURF_ADM                                                   1.42
PRECISION                                                  None
SEMENCE                                                       0
DEST_ICHN                                                  None
CULTURE_D1                                                 None
CULTURE_D2                                                 None
BIO                                                           0
ENGAGEMENT                                                 None
MARAICHAGE                                                    0
AGROFOREST                                                 None
TLENQ                                                         1
CODUTISOL                                                    03
LIBCULTURE                                        03_ORGE_HIVER
SURUTISOL                                                   NaN
RENDNORME                                                   NaN
MMEAU                                                       NaN
geometry      POLYGON ((506016.382600002 6234988.195600003, ...
Name: 0, dtype: object

Quels sont les codes des cultures?

codes_cultures = set(df31.CODE_CULTU)
print(len(codes_cultures))
print(codes_cultures)
167
{'PCL', 'LO7', 'PH7', 'AGR', 'SAI', 'VE7', 'MCT', 'DTY', 'PHI', 'MOT', 'PPO', 'J6S', 'BTN', 'LIH', 'FLP', 'MPA', 'FNU', 'NOX', 'LUZ', 'CHT', 'OIG', 'CCT', 'MCR', 'POR', 'ME7', 'VRC', 'PAS', 'PVP', 'HBL', 'LOT', 'MLT', 'BTA', 'ML6', 'CEL', 'PH6', 'ANE', 'AUB', 'SGH', 'LIP', 'ORT', 'J5M', 'TRN', 'FAG', 'HAR', 'PFR', 'SRS', 'CHU', 'FEV', 'TR6', 'CAR', 'VES', 'BDH', 'ART', 'TOM', 'PPP', 'MH7', 'LEC', 'FRA', 'SA6', 'TTP', 'NOS', 'RGA', 'TR5', 'PFP', 'LH6', 'PCH', 'ORH', 'LH7', 'RDI', 'PAN', 'BFS', 'FET', 'EPE', 'MPC', 'ME6', 'BDP', 'SA7', 'SOJ', 'CPL', 'PAG', 'J6P', 'GFP', 'RVI', 'LBF', 'CPH', 'MH6', 'LU6', 'LU5', 'BOR', 'CZP', 'MC5', 'LUD', 'TCR', 'PPR', 'SA5', 'TRU', 'PP7', 'PPA', 'FNO', 'CGO', 'AVP', 'TR7', 'LAV', 'VRG', 'BTP', 'TRE', 'CML', 'PTC', 'MIE', 'ML5', 'SNE', 'MH5', 'PTR', 'CZH', 'LU7', 'FF7', 'CCN', 'LDH', 'CRD', 'MC6', 'CHS', 'OAG', 'OLI', 'VRT', 'BOP', 'SGE', 'GES', 'PPH', 'SPH', 'BVF', 'CTG', 'AIL', 'FVL', 'CID', 'NVT', 'MLG', 'BRO', 'PRL', 'FLA', 'SPL', 'SBO', 'ORP', 'BTH', 'MC7', 'TTH', 'CAG', 'AVH', 'BFP', 'ML7', 'POT', 'TOP', 'PSL', 'FSG', 'CIT', 'MID', 'LP7', 'PFH', 'MIS', 'CES', 'CMB', 'PEP', 'TAB', 'VE6', 'MLO', 'MOH', 'SOG', 'ROQ'}

Mais il est plus lisible de regarder LIBCULTURE

libs_cultures = set(df31.LIBCULTURE)
print(len(libs_cultures))
print(libs_cultures)
20
{'11_POIS_PROTEAGINEUX', '62_MELANGE_PROTEAGINEUX', '12_FEVE_FEVEROLE', '00_XXXXXXXXXX', '31_POMME_DE_TERRE_CONSO', '14_SOJA', '03_ORGE_HIVER', '01_BLE_TENDRE', '13_TOURNESOL', '21_BETTERAVE_INDUSTRIELLE', '04_ORGE_PRINTEMPS', '07_TRITICALE', '09_COLZA', '05_AVOINE', '15_SORGHO', '06_SEIGLE', '61_MELANGE_CEREALES', '02_BLE_DUR', '17_MAIS_GRAIN', '20_MAIS_FOURRAGE'}

Mais il y a beaucoup plus de CODE_CULTU que de LIBCULTURE. Peut-être que le 2è n’existe que pour les parcelles avec rendement?

Lien entre les 2 champs?

codes_libs = df31[['CODE_CULTU', 'LIBCULTURE']].drop_duplicates()
print(len(codes_libs))
167

Répartition des cultures

df31.LIBCULTURE.value_counts()
00_XXXXXXXXXX                83046
01_BLE_TENDRE                10423
13_TOURNESOL                 10073
02_BLE_DUR                    8268
17_MAIS_GRAIN                 5470
03_ORGE_HIVER                 3012
09_COLZA                      2090
14_SOJA                       1973
20_MAIS_FOURRAGE              1447
15_SORGHO                     1404
07_TRITICALE                  1201
11_POIS_PROTEAGINEUX          1021
12_FEVE_FEVEROLE               470
62_MELANGE_PROTEAGINEUX        324
05_AVOINE                      293
61_MELANGE_CEREALES            290
04_ORGE_PRINTEMPS              251
31_POMME_DE_TERRE_CONSO        157
06_SEIGLE                       19
21_BETTERAVE_INDUSTRIELLE       14
Name: LIBCULTURE, dtype: int64

Histogramme valeurs de rendement:

df31.RENDNORME.describe()
count    4713.000000
mean       46.262873
std        20.104306
min         1.000000
25%        27.840000
50%        48.000000
75%        63.000000
max       120.000000
Name: RENDNORME, dtype: float64
import matplotlib.pyplot as plt
plt.figure(figsize=(10,5))
df31.hist(column='RENDNORME')
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureR5oWbq.png

Valeurs manquantes de rendement?

num_parcelles = len(df31.RENDNORME)
num_rendements = df31.RENDNORME.count()
print('{} parcelles dont {} avec information de rendement'.format(num_parcelles, num_rendements))
131246 parcelles dont 4713 avec information de rendement

Disponibiité des données de rendement par culture

df31[df31.RENDNORME.notnull()]['LIBCULTURE'].value_counts()
13_TOURNESOL               1178
02_BLE_DUR                 1125
01_BLE_TENDRE               935
17_MAIS_GRAIN               294
09_COLZA                    258
03_ORGE_HIVER               250
14_SOJA                     241
11_POIS_PROTEAGINEUX        136
15_SORGHO                   118
07_TRITICALE                 70
12_FEVE_FEVEROLE             57
20_MAIS_FOURRAGE             17
05_AVOINE                    15
04_ORGE_PRINTEMPS            14
06_SEIGLE                     4
31_POMME_DE_TERRE_CONSO       1
Name: LIBCULTURE, dtype: int64
import matplotlib.pyplot as plt
plt.figure(figsize=(10,5))
df31[df31.RENDNORME.notnull()]['CODE_CULTU'].value_counts().plot.bar()
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureJIfLg1.png

Distribution statisique des rendements par culture

rnd_counts = df31[df31.RENDNORME.notnull()]['CODE_CULTU'].value_counts()
maj6_rnd = rnd_counts.iloc[:6]
fig, axes = plt.subplots(3, 2, sharex=True, sharey=True)
print(axes)
for current_cult, axis in zip(maj6_rnd.index, axes.flatten()):
    axis.set_xlabel(current_cult)
    df31[df31['CODE_CULTU']==current_cult].hist(column='RENDNORME', ax=axis)
    axis.set_title('')
plt.suptitle('RENDNORME')
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureukzYe2.png

Distribution spatiale des rendements

Ne garder que les parcelles avec une seule culture

Le champ SURF_ADM est la surface de la parcelle. Le champ SURUTISOL est la surface totale de toutes les parcelles de la même culture dans la même exploitation. S’il n’y a qu’une parcelle de cette culture dans cette exploitation, ces 2 surfaces sont très proches (ou égales). Le rendemenet moyen pour toutes les parcelles de l’exploitation de même culture est donné par RENDNORME.

df31par = df31.query('RENDNORME.notnull() & SURUTISOL.notnull() & (abs(SURUTISOL - SURF_ADM)/(SURF_ADM) < 0.05)')
print(df31par[['SURF_ADM', 'SURUTISOL']].head())
print('------------')
print(df31par['LIBCULTURE'].value_counts())
SURF_ADM  SURUTISOL
78       27.69      27.00
644       3.42       3.50
937      15.87      16.00
5115      1.05       1.06
5303     19.41      20.13
------------
01_BLE_TENDRE              23
03_ORGE_HIVER              20
14_SOJA                    20
11_POIS_PROTEAGINEUX       19
17_MAIS_GRAIN              18
13_TOURNESOL               18
09_COLZA                   12
15_SORGHO                  11
07_TRITICALE                7
12_FEVE_FEVEROLE            7
02_BLE_DUR                  6
04_ORGE_PRINTEMPS           4
05_AVOINE                   3
06_SEIGLE                   2
20_MAIS_FOURRAGE            1
31_POMME_DE_TERRE_CONSO     1
Name: LIBCULTURE, dtype: int64

Histogramme des rendements par culture sur les parcelles pures

rnd_counts = df31par['CODE_CULTU'].value_counts()
maj6_rnd = rnd_counts.iloc[:6]
fig, axes = plt.subplots(3, 2, sharex=True, sharey=True)
print(axes)
for current_cult, axis in zip(maj6_rnd.index, axes.flatten()):
    axis.set_xlabel(current_cult)
    df31par[df31par['CODE_CULTU']==current_cult].hist(column='RENDNORME', ax=axis)
    axis.set_title('')
plt.suptitle('RENDNORME')
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figurevurUbh.png

Ajouter une colonne avec le centroïde de la parcelle

df31par['centroid'] = df31par.geometry.centroid
print(df31par.centroid.head())
78      POINT (576943.0567814494 6248166.951789126)
644     POINT (573025.7548245641 6264003.533300648)
937     POINT (572496.3795754968 6266017.564098726)
5115    POINT (560246.9060276946 6236853.034810145)
5303    POINT (583985.3814635368 6245950.934973895)
dtype: object
576943.0567814494
import matplotlib.pyplot as plt
plt.figure(figsize=(200,100))
df31par.plot(column='RENDNORME', cmap='viridis')
#plt.show()
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figure4XvQFY.png

Il n’y a pas de corrélation visible entre rendement et position dans le département.

Corrélation entre rendement et taille de la parcelle

colordict = {lc:i for i, lc in enumerate(set(df31par['LIBCULTURE']))}
print(colordict)
colors = [colordict[c] for c in df31par['LIBCULTURE']]
{'14_SOJA': 0, '02_BLE_DUR': 1, '03_ORGE_HIVER': 2, '01_BLE_TENDRE': 3, '11_POIS_PROTEAGINEUX': 4, '13_TOURNESOL': 5, '04_ORGE_PRINTEMPS': 6, '07_TRITICALE': 7, '15_SORGHO': 8, '09_COLZA': 9, '12_FEVE_FEVEROLE': 10, '17_MAIS_GRAIN': 11, '20_MAIS_FOURRAGE': 12, '31_POMME_DE_TERRE_CONSO': 13, '05_AVOINE': 14, '06_SEIGLE': 15}
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.figure(figsize=(10,5))
plt.scatter(df31par['SURF_ADM'], df31par['RENDNORME'], c=colors, cmap=cm.get_cmap('tab20',len(colordict)))
cbar = plt.colorbar()
cbar.ax.get_yaxis().set_ticks([])
for lab, j in colordict.items():
    cbar.ax.text(.5, j*0.95, lab, ha='left')
plt.xlabel('SURF_ADM')
plt.ylabel('REND')
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureUjRq6C.png

Il semble y avoir une corrélation positive entre rendement et taille de la parcelle, conditionnée par la classe

rnd_counts = df31par['LIBCULTURE'].value_counts()
maj6_rnd = rnd_counts.iloc[:6]
01_BLE_TENDRE           23
03_ORGE_HIVER           20
14_SOJA                 20
11_POIS_PROTEAGINEUX    19
17_MAIS_GRAIN           18
13_TOURNESOL            18
Name: LIBCULTURE, dtype: int64
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np

class SeabornFig2Grid():

    def __init__(self, seaborngrid, fig,  subplot_spec):
        self.fig = fig
        self.sg = seaborngrid
        self.subplot = subplot_spec
        if isinstance(self.sg, sns.axisgrid.FacetGrid) or \
            isinstance(self.sg, sns.axisgrid.PairGrid):
            self._movegrid()
        elif isinstance(self.sg, sns.axisgrid.JointGrid):
            self._movejointgrid()
        self._finalize()

    def _movegrid(self):
        """ Move PairGrid or Facetgrid """
        self._resize()
        n = self.sg.axes.shape[0]
        m = self.sg.axes.shape[1]
        self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot)
        for i in range(n):
            for j in range(m):
                self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j])

    def _movejointgrid(self):
        """ Move Jointgrid """
        h= self.sg.ax_joint.get_position().height
        h2= self.sg.ax_marg_x.get_position().height
        r = int(np.round(h/h2))
        self._resize()
        self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot)

        self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1])
        self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1])
        self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1])

    def _moveaxes(self, ax, gs):
        #https://stackoverflow.com/a/46906599/4124317
        ax.remove()
        ax.figure=self.fig
        self.fig.axes.append(ax)
        self.fig.add_axes(ax)
        ax._subplotspec = gs
        ax.set_position(gs.get_position(self.fig))
        ax.set_subplotspec(gs)

    def _finalize(self):
        plt.close(self.sg.fig)
        self.fig.canvas.mpl_connect("resize_event", self._resize)
        self.fig.canvas.draw()

    def _resize(self, evt=None):
        self.sg.fig.set_size_inches(self.fig.get_size_inches())
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns; sns.set()
from scipy import stats

plotlist = []
for current_cult in maj6_rnd.index:
    dfcurr = df31par.query('LIBCULTURE == "{}"'.format(current_cult))
    g = sns.jointplot('SURF_ADM', 'RENDNORME', data=dfcurr, kind='reg').set_axis_labels('SURF_'+current_cult,'REND')
    rsquare = lambda a, b: stats.spearmanr(a, b)[0] ** 2
    g.annotate(rsquare, template="{stat}: {val:.2f}", stat="$Spearman R^2$", loc="upper left", fontsize=12)
    plotlist.append(g)

fig = plt.figure(figsize=(10,10))

gs = gridspec.GridSpec(3, 2)

for i, p in enumerate(plotlist):
    SeabornFig2Grid(p, fig, gs[i])

gs.tight_layout(fig)
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureYa6zlX.png

La corrélation n’est valable que pour le blé tendre et le tournesol. Il faudra évidemment refaire tout ça avec tous les départements.

Corrélation avec autres variables (qui n’ont pas besoin des images)

Irrigation

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns; sns.set()
from scipy import stats

plotlist = []
for current_cult in maj6_rnd.index:
    dfcurr = df31par.query('LIBCULTURE == "{}"'.format(current_cult))
    g = sns.jointplot('MMEAU', 'RENDNORME', data=dfcurr, kind='reg').set_axis_labels('IRR_'+current_cult,'REND')
    rsquare = lambda a, b: stats.spearmanr(a, b)[0] ** 2
    g.annotate(rsquare, template="{stat}: {val:.2f}", stat="$Spearman R^2$", loc="upper left", fontsize=12)
    plotlist.append(g)

fig = plt.figure(figsize=(10,10))

gs = gridspec.GridSpec(3, 2)

for i, p in enumerate(plotlist):
    SeabornFig2Grid(p, fig, gs[i])

gs.tight_layout(fig)
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureVmssHE.png

Bio

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns; sns.set()
from scipy import stats

plotlist = []
for current_cult in maj6_rnd.index:
    dfcurr = df31par.query('LIBCULTURE == "{}"'.format(current_cult))
    g = sns.jointplot('BIO', 'RENDNORME', data=dfcurr, kind='reg').set_axis_labels('BIO_'+current_cult,'REND')
    rsquare = lambda a, b: stats.spearmanr(a, b)[0] ** 2
    g.annotate(rsquare, template="{stat}: {val:.2f}", stat="$Spearman R^2$", loc="upper left", fontsize=12)
    plotlist.append(g)

fig = plt.figure(figsize=(10,10))

gs = gridspec.GridSpec(3, 2)

for i, p in enumerate(plotlist):
    SeabornFig2Grid(p, fig, gs[i])

gs.tight_layout(fig)
plt.savefig(matplot_lib_filename)
matplot_lib_filename

/tmp/babel-X17w0V/figureE5mY6F.png

Différence entre surface admin et vraie surface

Latitude

Quels sont les départements contenant des informations de rendement?

Quelles sont les cultures pour lesquelles les informations de rendement sont les plus nombreuses?

Peut-on prédire le rendement à partir des données TERLAB seules?

  • Localisation, culture, surface de la parcelle

Quelles sont les tuiles S2 nécessaires pour couvrir chaque département?