from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
{'city': 'Dubai', 'temperature': 40.},
{'temperature': 20.}, # missing categorical feature ('city')
{'city': 'Dubai'} # missing numerical feature ('temperature'); DictVectorizer fills it with the default value 0
]
vec = DictVectorizer()
vec.fit_transform(measurements).toarray() # numpy.ndarray
array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.],
       [ 1.,  0.,  0., 40.],
       [ 0.,  0.,  0., 20.],
       [ 1.,  0.,  0.,  0.]])
vec.get_feature_names() # get a list of feature names (on scikit-learn >= 1.0, use get_feature_names_out() instead)
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
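Once the vectorizer has been fitted, it can be applied to new samples; feature values it did not see during fit are silently dropped. A minimal sketch ('Tokyo' is a made-up value that was not in the training data), which should leave every city column at 0:
vec.transform([{'city': 'Tokyo', 'temperature': 25.}]).toarray() # unseen category -> all city columns stay 0
array([[ 0.,  0.,  0., 25.]])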
movie_entry = [{'category': ['thriller', 'drama'], 'year': 2003},
{'category': ['animation', 'family'], 'year': 2011},
{'category': [], 'year': 2011}, # empty list of categorical features
{'year': 1974}, # missing categorical feature ('category')
{'category': ['thriller']}, # missing numerical feature ('year'); defaults to 0
]
vec = DictVectorizer()
vec.fit_transform(movie_entry).toarray() # numpy.ndarray
array([[0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.003e+03],
       [1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 2.011e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 2.011e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.974e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00]])
vec.get_feature_names() # get a list of feature names
['category=animation', 'category=drama', 'category=family', 'category=thriller', 'year']
from sklearn.feature_extraction import FeatureHasher
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Francisco', 'temperature': 18.},
{'city': 'Dubai', 'temperature': 40.},
{'temperature': 20.}, # missing categorical feature ('city')
{'city': 'Dubai'} # missing numerical feature ('temperature'); defaults to 0
]
h = FeatureHasher(n_features=4)
h.fit_transform(measurements).toarray()
array([[  0., -33.,  -1.,   0.],
       [  0., -12.,  -1.,   0.],
       [ -1., -18.,   0.,   0.],
       [  0., -40.,  -1.,   0.],
       [  0., -20.,   0.,   0.],
       [  0.,   0.,  -1.,   0.]])
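The negative entries come from the signed hash FeatureHasher applies to reduce the bias introduced by hash collisions. If non-negative values are preferred, the hasher can be constructed with alternate_sign=False; a small sketch on the same measurements as above:
h = FeatureHasher(n_features=4, alternate_sign=False)
h.fit_transform(measurements).toarray() # same hashed columns, but every value is added with a positive sign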
$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, where $\text{tf}(t, d)$ is the count of term $t$ in document $d$. With smooth_idf=False, $\text{idf}(t) = \ln\frac{n}{df(t)} + 1$, where $n$ is the total number of documents and $df(t)$ is the number of documents that contain term $t$ (the default smooth_idf=True adds 1 to both $n$ and $df(t)$). Each resulting tf-idf vector is then normalized to unit Euclidean (L2) norm.
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
# 1-gram
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus).toarray()
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])
vectorizer.get_feature_names() # get a list of feature names
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
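The column index of an individual term can also be looked up in the fitted vocabulary; a small sketch:
vectorizer.vocabulary_.get('document') # column index of the term 'document' in the array above
1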
# 2-gram
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2), # extract both 1-grams and 2-grams; default is (1, 1)
    token_pattern=r'\b\w+\b', # keep single-character tokens too; default is r'(?u)\b\w\w+\b'
    min_df=1 # ignore terms whose document frequency is strictly below this threshold
             # (a float is a proportion of documents, an int is an absolute count)
)
bigram_vectorizer.fit_transform(corpus).toarray()
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])
bigram_vectorizer.get_feature_names() # get a list of feature names
['and', 'and the', 'document', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second document', 'second second', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this is', 'this the']
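Because bigrams keep some local word order, documents made of the same words can still be distinguished; for example (a short sketch reusing the fitted bigram_vectorizer):
feature_index = bigram_vectorizer.vocabulary_.get('is this')
bigram_vectorizer.fit_transform(corpus).toarray()[:, feature_index] # 'is this' occurs only in the last document
array([0, 0, 0, 1])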
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
# 1-gram
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(corpus).toarray() # raw term counts, used as input to TfidfTransformer below
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer.fit_transform(counts).toarray()
array([[0.        , 0.43306685, 0.56943086, 0.43306685, 0.        , 0.        , 0.33631504, 0.        , 0.43306685],
       [0.        , 0.24014568, 0.        , 0.24014568, 0.        , 0.89006176, 0.18649454, 0.        , 0.24014568],
       [0.56115953, 0.        , 0.        , 0.        , 0.56115953, 0.        , 0.23515939, 0.56115953, 0.        ],
       [0.        , 0.43306685, 0.56943086, 0.43306685, 0.        , 0.        , 0.33631504, 0.        , 0.43306685]])
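As a sanity check, the first row can be reproduced by hand from counts: with smooth_idf=False the idf is ln(n / df(t)) + 1, and each row is then L2-normalized. A minimal sketch:
import numpy as np
n = counts.shape[0] # number of documents (4)
df = (counts > 0).sum(axis=0) # document frequency of each term
idf = np.log(n / df) + 1 # idf with smooth_idf=False
row0 = counts[0] * idf # un-normalized tf-idf of the first document
row0 / np.linalg.norm(row0) # should match the first row of the array above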
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus).toarray()
array([[0.        , 0.43877674, 0.54197657, 0.43877674, 0.        , 0.        , 0.35872874, 0.        , 0.43877674],
       [0.        , 0.27230147, 0.        , 0.27230147, 0.        , 0.85322574, 0.22262429, 0.        , 0.27230147],
       [0.55280532, 0.        , 0.        , 0.        , 0.55280532, 0.        , 0.28847675, 0.55280532, 0.        ],
       [0.        , 0.43877674, 0.54197657, 0.43877674, 0.        , 0.        , 0.35872874, 0.        , 0.43877674]])
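TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer; with default settings (smooth_idf=True, unlike the transformer example above) the two routes should agree. A small sketch:
import numpy as np
two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
np.allclose(two_step.toarray(), TfidfVectorizer().fit_transform(corpus).toarray()) # True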
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(corpus).toarray() # HashingVectorizer is stateless, so this equals hv.fit_transform(corpus).toarray()
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.81649658,  0.        ,  0.40824829, -0.40824829,  0.        ],
       [ 0.        ,  0.5       ,  0.        ,  0.        , -0.5       , -0.5       ,  0.        ,  0.        , -0.5       ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,  0.        , -0.57735027,  0.57735027, -0.57735027,  0.        ]])
import pandas as pd
data = pd.read_csv('housing.csv')
from sklearn.model_selection import train_test_split
import numpy as np
data_X = data.drop(['median_house_value'], axis=1) # features (DataFrame)
data_Y = data['median_house_value'] # target (Series)
# bucket median income into categories so the split can be stratified on it
data['income_cat'] = np.ceil(data['median_income']/1.5)
data['income_cat'] = data['income_cat'].where(data['income_cat'] < 5, 5.0) # cap the categories at 5
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify=data['income_cat'])
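The stratification can be verified by comparing the income_cat proportions of the full dataset with those of the training split (a quick sketch; the indices of train_X still refer to rows of data):
data['income_cat'].value_counts(normalize=True) # proportions in the full dataset
data.loc[train_X.index, 'income_cat'].value_counts(normalize=True) # proportions in the training split, nearly identical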
train_X.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|
| 17606 | -121.89 | 37.29 | 38.0 | 1568.0 | 351.0 | 710.0 | 339.0 | 2.7042 | <1H OCEAN |
| 18632 | -121.93 | 37.05 | 14.0 | 679.0 | 108.0 | 306.0 | 113.0 | 6.4214 | <1H OCEAN |
| 14650 | -117.20 | 32.77 | 31.0 | 1952.0 | 471.0 | 936.0 | 462.0 | 2.8621 | NEAR OCEAN |
| 3230 | -119.61 | 36.31 | 25.0 | 1847.0 | 371.0 | 1460.0 | 353.0 | 1.8839 | INLAND |
| 3555 | -118.59 | 34.23 | 17.0 | 6592.0 | 1525.0 | 4459.0 | 1463.0 | 3.0347 | <1H OCEAN |
vectorizer = CountVectorizer()
vectorizer.fit_transform(train_X['ocean_proximity']) # train_X['ocean_proximity'] is a Series, which CountVectorizer accepts as an iterable of documents
vectorizer.fit_transform(train_X[['ocean_proximity']]) # train_X[['ocean_proximity']] is a DataFrame; iterating over it yields only the column name, so the result is a 1x1 matrix
<1x1 sparse matrix of type '<class 'numpy.int64'>' with 1 stored elements in Compressed Sparse Row format>
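Here CountVectorizer acts as a makeshift one-hot encoder; a more conventional choice for a single categorical column is OneHotEncoder, which expects a 2-D input such as a one-column DataFrame (a minimal sketch):
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore') # categories unseen at transform time become all-zero rows
encoder.fit_transform(train_X[['ocean_proximity']]) # sparse matrix with one column per category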
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('one_hot', CountVectorizer()), # CountVectorizer used as a makeshift one-hot encoder; it needs a 1-D (Series) input
])
from sklearn.compose import ColumnTransformer
preprocess_pipeline = ColumnTransformer([
("num_pipeline", num_pipeline, ['longitude','latitude']), # pass a DataFrame into num_pipeline
("cat_pipeline", cat_pipeline, 'ocean_proximity'), # pass a Series into cat_pipeline
])
preprocess_pipeline.fit_transform(train_X)
array([[-1.15604281,  0.77194962,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [-1.17602483,  0.6596948 ,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [ 1.18684903, -1.34218285,  0.        , ...,  0.        ,  1.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134,  0.        , ...,  0.        ,  0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  1.        , ...,  0.        ,  0.        ,  1.        ],
       [-1.43579109,  0.99645926,  0.        , ...,  0.        ,  1.        ,  0.        ]])
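The preprocessing can also be chained with an estimator so that a single fit call covers both steps; a rough sketch with LinearRegression (any other regressor could be substituted):
from sklearn.linear_model import LinearRegression
full_pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('regressor', LinearRegression()),
])
full_pipeline.fit(train_X, train_Y) # fits the imputer, scaler, vectorizer and the regressor in one go
full_pipeline.predict(test_X) # the same preprocessing is applied to the test set before predicting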